<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/1_data_harmonization_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download and harmonize multiple datasets from the Hugging Face hub


### Install and setup

In [1]:
!pip install datasets~=2.14.0 -qqq

In [None]:
## load packages

import pandas as pd
import numpy as np
import os
from datasets import load_dataset
import torch

from sklearn.model_selection import train_test_split

from google.colab.data_table import DataTable
from google.colab import data_table
from IPython.display import display
# switch on Colab's dataframe formatter for pandas DataFrames
data_table.enable_dataframe_formatter() # https://colab.research.google.com/notebooks/data_table.ipynb#scrollTo=JgBtx0xFFv_i

# set global seed for reproducibility and against seed hacking
# SEED_GLOBAL is also passed as random_state to every sampling / splitting call further down
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)


In [None]:
## connect to google drive
# mounts Google Drive under /content/drive; force_remount=False is passed explicitly
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

#set wd
# the working directory is printed before and after the change; all relative paths
# used later in the notebook (./datasets_raw, ./datasets_standardized) resolve against it
# NOTE(review): the path below is specific to one Drive layout - adapt before running elsewhere
print(os.getcwd())
os.chdir("/content/drive/My Drive/PhD/zero-shot-models")
print(os.getcwd())

### Download and harmonize datasets

#### Well-formed query dataset

In [None]:
# load the dataset via datasets.load_dataset and print its split structure;
# the "train", "validation" and "test" splits are accessed further down
dataset_wellformedquery = load_dataset("google_wellformed_query")
print("Raw dataset structure:\n", dataset_wellformedquery)

def prompt_format_wellformed_query(label):
    """Translate a numeric well-formedness rating into a binary text label.

    Ratings of 0.75 and above map to "well_formed", ratings of 0.35 and below
    map to "not_well_formed". Anything in between returns ``np.nan``; a NaN
    rating fails both comparisons and therefore also returns ``np.nan``.
    """
    if label >= 0.75:
        return "well_formed"
    if label <= 0.35:
        return "not_well_formed"
    # ambiguous rating: signal "no label" so the caller can drop the row
    return np.nan


## prepare df_train
# the train and validation splits are stacked into one training frame
df_wellformedquery_train = pd.concat(
    [dataset_wellformedquery[split_name].to_pandas() for split_name in ("train", "validation")]
)

# rating -> text label; ratings between the two thresholds become NaN
df_wellformedquery_train["label_text"] = df_wellformedquery_train["rating"].apply(prompt_format_wellformed_query)

# remove na
df_wellformedquery_train = df_wellformedquery_train[df_wellformedquery_train["label_text"].notna()]

# standardize the text column name, then keep only the first occurrence of each text
df_wellformedquery_train = df_wellformedquery_train.rename(columns={"content": "text"})
df_wellformedquery_train = df_wellformedquery_train[~df_wellformedquery_train["text"].duplicated()]

# create standard label column (integer codes follow the sorted label_text values)
df_wellformedquery_train["label_standard"] = pd.factorize(df_wellformedquery_train["label_text"], sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_wellformedquery_train))
n_data_per_label = 10_000
df_wellformedquery_train = df_wellformedquery_train.groupby(
    "label_text", as_index=False, group_keys=False
).apply(
    lambda df_label: df_label.sample(n=min(n_data_per_label, len(df_label)), random_state=SEED_GLOBAL)
)
print("Dataset length after downsampling: ", len(df_wellformedquery_train))

# final harmonized format
df_wellformedquery_train = df_wellformedquery_train.reset_index(drop=True)[["text", "label_text", "label_standard"]]

print("Label distribution in dataset:\n", df_wellformedquery_train["label_text"].value_counts())
display(df_wellformedquery_train)



In [None]:
## prepare df_test
# same steps as for the training frame, without de-duplication and downsampling
df_wellformedquery_test = dataset_wellformedquery["test"].to_pandas()
df_wellformedquery_test["label_text"] = df_wellformedquery_test["rating"].apply(prompt_format_wellformed_query)
# drop rows whose rating fell between the two thresholds
df_wellformedquery_test = df_wellformedquery_test[df_wellformedquery_test["label_text"].notna()]
df_wellformedquery_test = df_wellformedquery_test.rename(columns={"content": "text"})
df_wellformedquery_test["label_standard"] = pd.factorize(df_wellformedquery_test["label_text"], sort=True)[0]
df_wellformedquery_test = df_wellformedquery_test.reset_index(drop=True)[["text", "label_text", "label_standard"]]

display(df_wellformedquery_test)

In [None]:
# save harmonized data to disk
# both splits are written as gzip-compressed parquet files; the paths are relative
# to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "wellformedquery"
df_wellformedquery_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_wellformedquery_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Social bias frames dataset

In [None]:
# dataset https://huggingface.co/datasets/social_bias_frames
# explanation of variables https://huggingface.co/datasets/social_bias_frames#default-1

from datasets import Dataset, DatasetDict, concatenate_datasets
import html

dataset_bias_frames = load_dataset("social_bias_frames")
print("Raw dataset structure:\n", dataset_bias_frames)

# fold the validation split into the train split
dataset_bias_frames["train"] = concatenate_datasets(
    [dataset_bias_frames[split_name] for split_name in ("train", "validation")]
)

# annotation columns that are converted to numbers and then averaged per post
columns_to_convert = ['intentYN', 'sexYN', 'offensiveYN', 'whoTarget', 'speakerMinorityYN']

# one row per unique post, holding the mean of each annotation column
dataset_bias_frames_mean = DatasetDict()
for split in ("train", "test"):
    df_bias_frames = dataset_bias_frames[split].to_pandas()

    # values that cannot be parsed as numbers become NaN (errors='coerce')
    df_bias_frames[columns_to_convert] = df_bias_frames[columns_to_convert].apply(pd.to_numeric, errors='coerce')

    # aggregate the judgements of the different annotators of the same post
    df_bias_frames_mean = (
        df_bias_frames
        .groupby("post", as_index=False, group_keys=False)[columns_to_convert]
        .mean(numeric_only=True)
        .reset_index(drop=True)
    )

    dataset_bias_frames_mean[split] = Dataset.from_pandas(df_bias_frames_mean)

print(dataset_bias_frames_mean)


In [None]:
# decide on binary label based on threshold

def prompt_format_bias_frames(example, column_label=None, label_text=None):
    """Turn an averaged annotation score into a binary label for one sub-task.

    Args:
        example: mapping with a "post" entry (the raw text) and the averaged
            score stored under ``column_label``.
        column_label: key of the score to threshold.
        label_text: name of the positive class; the negative class is named
            ``"not_" + label_text``.

    Returns:
        dict with the keys "text" (the post with HTML entities decoded),
        "label_text" and "label_standard":
        score >= 0.70 -> (label_text, 1); score <= 0.30 -> ("not_" + label_text, 0);
        missing or in-between score -> (None, NaN).
    """
    score = example[column_label]

    # pd.isna recognises None and every float NaN. An identity test against
    # np.nan only matches the np.nan singleton itself, not NaN values produced
    # elsewhere (e.g. float("nan") or np.float64("nan")).
    if pd.isna(score):
        label = np.nan
        label_text_out = None
    elif score >= 0.70:
        label = 1
        label_text_out = label_text
    elif score <= 0.30:
        label = 0
        label_text_out = "not_" + label_text
    else:
        # annotators disagree too much for a clear label
        label = np.nan
        label_text_out = None

    # decode HTML entities such as "&amp;" in the post text
    text = html.unescape(example["post"])

    return {"text": text, "label_text": label_text_out, "label_standard": label}

# create cleaned datasets for each subtask
# result: dataset_bias_frames_dic maps the task name without its "YN" suffix
# ("sex", "intent", "offensive") to a DatasetDict with a "train" and a "test" split
dataset_bias_frames_dic = {}
for task in ["sexYN", "intentYN", "offensiveYN"]:
    # the positive class is named after the task, e.g. "sexYN" -> "sex"
    label_text = task.replace("YN", "")
    # adds the columns "text", "label_text" and "label_standard" to every row
    dataset_bias_frames_mean_task = dataset_bias_frames_mean.map(
        lambda example: prompt_format_bias_frames(example, column_label=task, label_text=label_text)
    )
    # keep only the three harmonized columns
    columns_to_remove = [col for col in dataset_bias_frames_mean_task["train"].column_names if col not in ["text", "label_text", "label_standard"]]
    dataset_bias_frames_mean_task = dataset_bias_frames_mean_task.remove_columns(columns_to_remove)

    # remove nans
    # (rows for which prompt_format_bias_frames produced no clear label)
    dataset_bias_frames_mean_task = dataset_bias_frames_mean_task.filter(lambda example: not np.isnan(example['label_standard']))
    #dataset_bias_frames_mean_task = dataset_bias_frames_mean_task.shuffle(seed=SEED_GLOBAL)

    # downsample
    # done in pandas: at most n_data_per_label rows are sampled per label_text value
    dataset_bias_frames_mean_task_dic = {}
    for split in ["train", "test"]:
        print("Dataset length before downsampling: ", len(dataset_bias_frames_mean_task[split]))
        n_data_per_label = 10_000
        df_bias_frames_mean_task_split = dataset_bias_frames_mean_task[split].to_pandas().groupby("label_text", as_index=False, group_keys=False).apply(
            lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
        )
        print("Dataset length after downsampling: ", len(df_bias_frames_mean_task_split))
        # shuffle
        # (the grouped sampling above returns the rows grouped by label)
        df_bias_frames_mean_task_split = df_bias_frames_mean_task_split.sample(frac=1, random_state=SEED_GLOBAL)
        # back to dataset
        dataset_bias_frames_mean_task_dic.update({split: Dataset.from_pandas(df_bias_frames_mean_task_split.reset_index(drop=True))})
    dataset_bias_frames_mean_task = DatasetDict(dataset_bias_frames_mean_task_dic)

    # label_standard to int
    dataset_bias_frames_mean_task = dataset_bias_frames_mean_task.map(lambda x: {"label_standard": int(x["label_standard"])})

    print("Dataset for task: ", task)
    print(dataset_bias_frames_mean_task)
    print("Label distribution in train:\n", dataset_bias_frames_mean_task["train"].to_pandas().label_text.value_counts())
    print("\n")

    # stored under the short task name
    dataset_bias_frames_dic.update({label_text: dataset_bias_frames_mean_task})

print("Full dataset structure:\n", dataset_bias_frames_dic, "\n")

# show the first 100 rows of one task as a sanity check
print("Offensive dataset example: ")
display(dataset_bias_frames_dic["offensive"]["train"].to_pandas().head(100))

In [None]:
# deletable test
#dataset_bias_frames_dic["sex"]["train"].to_pandas().label_standard.value_counts()
#dataset_bias_frames_dic["sex"]["train"].to_pandas().label_standard.dtype

In [None]:
# save harmonized data to disk
dataset_name = "biasframes"
for key_task, value_dataset in dataset_bias_frames_dic.items():
    # train split: labels are cast to int and HTML entities in the text are decoded
    df_bias_frames_task_train = value_dataset["train"].to_pandas()
    df_bias_frames_task_train["label_standard"] = df_bias_frames_task_train["label_standard"].apply(int)
    df_bias_frames_task_train["text"] = df_bias_frames_task_train["text"].apply(html.unescape)

    # test split: identical treatment
    df_bias_frames_task_test = value_dataset["test"].to_pandas()
    df_bias_frames_task_test["label_standard"] = df_bias_frames_task_test["label_standard"].apply(int)
    df_bias_frames_task_test["text"] = df_bias_frames_task_test["text"].apply(html.unescape)

    # to disk: one gzip-compressed parquet file per task and split
    df_bias_frames_task_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_{key_task}_train.gzip", compression='gzip')
    df_bias_frames_task_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_{key_task}_test.gzip", compression='gzip')

#### Financial phrasebank dataset

In [None]:
dataset_financialphrasebank = load_dataset("financial_phrasebank", "sentences_75agree")
# only has train split
print("Raw dataset structure:\n", dataset_financialphrasebank)

# integer class id -> class name, read from the names of the dataset's "label" feature
label_mapping_financialphrasebank = {
    idx: name for idx, name in enumerate(dataset_financialphrasebank["train"].features["label"].names)
}

## prepare df_train
df_financialphrasebank_train = dataset_financialphrasebank["train"].to_pandas()

df_financialphrasebank_train["label_text"] = df_financialphrasebank_train.label.map(label_mapping_financialphrasebank)

# standardize the text column name, then keep only the first occurrence of each text
df_financialphrasebank_train = df_financialphrasebank_train.rename(columns={"sentence": "text"})
df_financialphrasebank_train = df_financialphrasebank_train[~df_financialphrasebank_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_financialphrasebank_train["label_standard"] = df_financialphrasebank_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_financialphrasebank_train))
n_data_per_label = 10_000
df_financialphrasebank_train = df_financialphrasebank_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# this count is taken after sampling, so it is labelled "after"
print("Dataset length after downsampling: ", len(df_financialphrasebank_train))

# final harmonized format
df_financialphrasebank_train = df_financialphrasebank_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_financialphrasebank_train.label_text.value_counts())
display(df_financialphrasebank_train)



In [None]:
## prepare df_test
# no test set
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as test set, stratified by label_text
(
    df_financialphrasebank_train_split,
    df_financialphrasebank_test_split,
) = train_test_split(
    df_financialphrasebank_train,
    stratify=df_financialphrasebank_train["label_text"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED_GLOBAL,
)
# sizes of the two resulting splits, one per line
print(len(df_financialphrasebank_train_split), len(df_financialphrasebank_test_split), sep="\n")


In [None]:
# save harmonized data to disk
# the index is reset first because the split frames keep the row labels of the frame they were split from
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "financialphrasebank"
df_financialphrasebank_train_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_financialphrasebank_test_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')


#### Rotten Tomatoes dataset

In [None]:
dataset_rottentomatoes = load_dataset("rotten_tomatoes")
print("Raw dataset structure:\n", dataset_rottentomatoes)

# integer class id -> class name, read from the names of the dataset's "label" feature
label_mapping_rottentomatoes = {
    idx: name for idx, name in enumerate(dataset_rottentomatoes["train"].features["label"].names)
}

## prepare df_train
# train and validation are merged into one training frame
df_rottentomatoes_train = pd.concat([
    dataset_rottentomatoes["train"].to_pandas(),
    dataset_rottentomatoes["validation"].to_pandas()
])

# class id -> short class name -> full-word label
df_rottentomatoes_train["label_text"] = df_rottentomatoes_train.label.map(label_mapping_rottentomatoes)
df_rottentomatoes_train["label_text"] = df_rottentomatoes_train["label_text"].map({"neg": "negative", "pos": "positive"})

# keep only the first occurrence of each text
df_rottentomatoes_train = df_rottentomatoes_train[~df_rottentomatoes_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_rottentomatoes_train["label_standard"] = df_rottentomatoes_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_rottentomatoes_train))
n_data_per_label = 10_000
df_rottentomatoes_train = df_rottentomatoes_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# this count is taken after sampling, so it is labelled "after"
print("Dataset length after downsampling: ", len(df_rottentomatoes_train))

# final harmonized format
df_rottentomatoes_train = df_rottentomatoes_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_rottentomatoes_train.label_text.value_counts())
display(df_rottentomatoes_train)



In [None]:
## prepare df_test
df_rottentomatoes_test = dataset_rottentomatoes["test"].to_pandas()
# class id -> short class name -> full-word label
df_rottentomatoes_test["label_text"] = (
    df_rottentomatoes_test["label"]
    .map(label_mapping_rottentomatoes)
    .map({"neg": "negative", "pos": "positive"})
)
# integer codes follow the sorted label_text values
df_rottentomatoes_test["label_standard"] = pd.factorize(df_rottentomatoes_test["label_text"], sort=True)[0]
df_rottentomatoes_test = df_rottentomatoes_test.reset_index(drop=True)[["text", "label_text", "label_standard"]]

display(df_rottentomatoes_test)

In [None]:
# save harmonized data to disk
# both splits are written as gzip-compressed parquet files relative to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "rottentomatoes"
df_rottentomatoes_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_rottentomatoes_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Amazon polarity dataset

In [None]:
dataset_amazonpolarity = load_dataset("amazon_polarity")
print("Raw dataset structure:\n", dataset_amazonpolarity)

# merge title and content for text column (applied to every split of the dataset)
dataset_amazonpolarity = dataset_amazonpolarity.map(lambda x: {"text": x["title"] + "\n" + x["content"]})

# integer class id -> class name, read from the names of the dataset's "label" feature
label_mapping_amazonpolarity = {
    idx: name for idx, name in enumerate(dataset_amazonpolarity["train"].features["label"].names)
}

## prepare df_train
df_amazonpolarity_train = dataset_amazonpolarity["train"].to_pandas()

df_amazonpolarity_train["label_text"] = df_amazonpolarity_train.label.map(label_mapping_amazonpolarity)

# keep only the first occurrence of each text
df_amazonpolarity_train = df_amazonpolarity_train[~df_amazonpolarity_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_amazonpolarity_train["label_standard"] = df_amazonpolarity_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_amazonpolarity_train))
n_data_per_label = 10_000
df_amazonpolarity_train = df_amazonpolarity_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# this count is taken after sampling, so it is labelled "after"
print("Dataset length after downsampling: ", len(df_amazonpolarity_train))

# final harmonized format
df_amazonpolarity_train = df_amazonpolarity_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_amazonpolarity_train.label_text.value_counts())
display(df_amazonpolarity_train)



In [None]:
## prepare df_test
df_amazonpolarity_test = dataset_amazonpolarity["test"].to_pandas()
# class id -> class name
df_amazonpolarity_test["label_text"] = df_amazonpolarity_test["label"].map(label_mapping_amazonpolarity)
# integer codes follow the sorted label_text values
df_amazonpolarity_test["label_standard"] = pd.factorize(df_amazonpolarity_test["label_text"], sort=True)[0]
df_amazonpolarity_test = df_amazonpolarity_test.reset_index(drop=True)[["text", "label_text", "label_standard"]]

display(df_amazonpolarity_test)

In [None]:
# save harmonized data to disk
# both splits are written as gzip-compressed parquet files relative to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "amazonpolarity"
df_amazonpolarity_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_amazonpolarity_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### IMDB dataset

In [None]:
dataset_imdb = load_dataset("imdb")
print("Raw dataset structure:\n", dataset_imdb)

# integer class id -> class name, read from the names of the dataset's "label" feature
label_mapping_imdb = {
    idx: name for idx, name in enumerate(dataset_imdb["train"].features["label"].names)
}

## prepare df_train
df_imdb_train = dataset_imdb["train"].to_pandas()

# class id -> short class name -> full-word label
df_imdb_train["label_text"] = df_imdb_train.label.map(label_mapping_imdb)

df_imdb_train["label_text"] = df_imdb_train["label_text"].map({"pos": "positive", "neg": "negative"})

# keep only the first occurrence of each text
df_imdb_train = df_imdb_train[~df_imdb_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_imdb_train["label_standard"] = df_imdb_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_imdb_train))
n_data_per_label = 10_000
df_imdb_train = df_imdb_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# this count is taken after sampling, so it is labelled "after"
print("Dataset length after downsampling: ", len(df_imdb_train))

# final harmonized format
df_imdb_train = df_imdb_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_imdb_train.label_text.value_counts())
display(df_imdb_train)



In [None]:
## prepare df_test
df_imdb_test = dataset_imdb["test"].to_pandas()
# class id -> short class name -> full-word label
df_imdb_test["label_text"] = (
    df_imdb_test["label"]
    .map(label_mapping_imdb)
    .map({"pos": "positive", "neg": "negative"})
)
# integer codes follow the sorted label_text values
df_imdb_test["label_standard"] = pd.factorize(df_imdb_test["label_text"], sort=True)[0]
df_imdb_test = df_imdb_test.reset_index(drop=True)[["text", "label_text", "label_standard"]]

display(df_imdb_test)

In [None]:
# save harmonized data to disk
# both splits are written as gzip-compressed parquet files relative to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "imdb"
df_imdb_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_imdb_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### App Reviews dataset

In [None]:
dataset_appreviews = load_dataset("app_reviews")
# only has train set
print("Raw dataset structure:\n", dataset_appreviews)

# keep only reviews whose length is between 100 and 1500 characters (both inclusive)
dataset_appreviews['train'] = dataset_appreviews['train'].filter(
    lambda x: 100 <= len(x["review"]) <= 1500
)

df_appreviews_train = dataset_appreviews["train"].to_pandas()

# convert 5-star scale into binary task, because only silver labels
def label_mapping_appreviews(label):
    """Collapse a 1-5 star rating into a binary sentiment label.

    Returns "positive" for 5 stars, "negative" for 2 stars or fewer and None
    for 3 or 4 stars (too unclear / culture dependent what they represent).

    Raises:
        Exception: if the value matches none of the cases above.
    """
    # star range: 1 - 5
    if label == 5:
        return "positive"
    if label <= 2:
        return "negative"
    if label in (3, 4):
        # deliberately unlabelled  #"mixed"
        return None
    raise Exception("Something went wrong when translating numeric to verbal labels")


# star rating -> "positive" / "negative" / None (3 and 4 stars)
df_appreviews_train["label_text"] = df_appreviews_train["star"].map(label_mapping_appreviews)
# drop the unlabelled 3/4-star reviews.
# `Series != None` is evaluated element-wise and is True for every row (missing
# values included), so it filters nothing; notna() is the mask that removes them.
df_appreviews_train = df_appreviews_train[df_appreviews_train["label_text"].notna()]

# standardize the text column name, then keep only the first occurrence of each text
df_appreviews_train = df_appreviews_train.rename(columns={"review": "text"})
df_appreviews_train = df_appreviews_train[~df_appreviews_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_appreviews_train["label_standard"] = df_appreviews_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_appreviews_train))
n_data_per_label = 10_000
df_appreviews_train = df_appreviews_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
print("Dataset length after downsampling: ", len(df_appreviews_train))

# final harmonized format
df_appreviews_train = df_appreviews_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_appreviews_train.label_text.value_counts())
display(df_appreviews_train)



In [None]:
## prepare df_test
# no test set
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as test set, stratified by label_text
(
    df_appreviews_train_split,
    df_appreviews_test_split,
) = train_test_split(
    df_appreviews_train,
    stratify=df_appreviews_train["label_text"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED_GLOBAL,
)
# sizes of the two resulting splits, one per line
print(len(df_appreviews_train_split), len(df_appreviews_test_split), sep="\n")


In [None]:
# save harmonized data to disk
# the index is reset first because the split frames keep the row labels of the frame they were split from
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "appreviews"
df_appreviews_train_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_appreviews_test_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')


#### Yelp review dataset

In [None]:
# load the dataset via datasets.load_dataset and print its split structure;
# the "train" and "test" splits are accessed further down
dataset_yelpreviews = load_dataset("yelp_review_full")
print("Raw dataset structure:\n", dataset_yelpreviews)

## prepare df_train
df_yelpreviews_train = dataset_yelpreviews["train"].to_pandas()

# convert 5-star scale into binary task, because only silver labels
def label_mapping_yelpreviews(label):
    """Collapse a 0-4 star label into a binary sentiment label.

    Returns "positive" for 4, "negative" for 1 or lower and None for 2 or 3
    (too unclear / culture dependent what they represent).

    Raises:
        Exception: if the value matches none of the cases above.
    """
    # star range: 0 - 4
    if label == 4:
        return "positive"
    if label <= 1:
        return "negative"
    if label in (2, 3):
        # deliberately unlabelled  #"mixed"
        return None
    raise Exception("Something went wrong when translating numeric to verbal labels")


# star label -> "positive" / "negative" / None (labels 2 and 3)
df_yelpreviews_train["label_text"] = df_yelpreviews_train["label"].map(label_mapping_yelpreviews)
# drop the unlabelled reviews.
# `Series != None` is evaluated element-wise and is True for every row (missing
# values included), so it filters nothing; notna() is the mask that removes them.
df_yelpreviews_train = df_yelpreviews_train[df_yelpreviews_train["label_text"].notna()]

# keep only the first occurrence of each text
df_yelpreviews_train = df_yelpreviews_train[~df_yelpreviews_train.text.duplicated()]

# integer codes follow the sorted label_text values
df_yelpreviews_train["label_standard"] = df_yelpreviews_train.label_text.factorize(sort=True)[0]

# downsample if too large: at most n_data_per_label rows per label
print("Dataset length before downsampling: ", len(df_yelpreviews_train))
n_data_per_label = 10_000
df_yelpreviews_train = df_yelpreviews_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# this count is taken after sampling, so it is labelled "after"
print("Dataset length after downsampling: ", len(df_yelpreviews_train))

# final harmonized format
df_yelpreviews_train = df_yelpreviews_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_yelpreviews_train.label_text.value_counts())
display(df_yelpreviews_train)



In [None]:
## prepare df_test
df_yelpreviews_test = dataset_yelpreviews["test"].to_pandas()
# star label -> "positive" / "negative" / None (labels 2 and 3)
df_yelpreviews_test["label_text"] = df_yelpreviews_test["label"].map(label_mapping_yelpreviews)
# The mask must be applied to the test frame itself. Indexing df_yelpreviews_train
# here would turn the "test" set into a copy of training rows.
# notna() is used because `Series != None` is True for every row and filters nothing.
# .copy() gives an independent frame for the column assignment below.
df_yelpreviews_test = df_yelpreviews_test[df_yelpreviews_test["label_text"].notna()].copy()

# integer codes follow the sorted label_text values
df_yelpreviews_test["label_standard"] = df_yelpreviews_test.label_text.factorize(sort=True)[0]
df_yelpreviews_test = df_yelpreviews_test[["text", "label_text", "label_standard"]].reset_index(drop=True)

display(df_yelpreviews_test)

In [None]:
# save harmonized data to disk
# both splits are written as gzip-compressed parquet files relative to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "yelpreviews"
df_yelpreviews_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_yelpreviews_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Wiki Toxic dataset

In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset

# hf version aggregated relevant sub-tasks. I'm using the original data instead
#from datasets import load_dataset
#dataset_wiki_toxic = load_dataset("OxAISH-AL-LLM/wiki_toxic")
#dataset_wiki_toxic["balanced_train"].to_pandas().head()

# data source: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data
df_wiki_toxic_train = pd.read_csv("./datasets_raw/wiki_toxic_train.csv.zip")
df_wiki_toxic_train = df_wiki_toxic_train.rename(columns={"comment_text": "text"})

# df_test merge texts with labels (texts and labels come in separate files, joined on "id")
df_wiki_toxic_test1 = pd.read_csv("./datasets_raw/wiki_toxic_test_labels.csv.zip")
df_wiki_toxic_test2 = pd.read_csv("./datasets_raw/wiki_toxic_test.csv.zip")
df_wiki_toxic_test = df_wiki_toxic_test2.merge(df_wiki_toxic_test1, on="id")
# remove unannotated texts (indicated by -1 label)
df_wiki_toxic_test = df_wiki_toxic_test[df_wiki_toxic_test["toxic"] != -1].reset_index(drop=True)
df_wiki_toxic_test = df_wiki_toxic_test.rename(columns={"comment_text": "text"})

# aggregate the toxic and severe_toxic labels into one label (1 if either of them is 1)
df_wiki_toxic_train["toxicaggregated"] = (
    (df_wiki_toxic_train["toxic"] == 1) | (df_wiki_toxic_train["severe_toxic"] == 1)
).astype(int)
df_wiki_toxic_test["toxicaggregated"] = (
    (df_wiki_toxic_test["toxic"] == 1) | (df_wiki_toxic_test["severe_toxic"] == 1)
).astype(int)

df_wiki_toxic_train = df_wiki_toxic_train.rename(columns={"identity_hate": "identityhate"})
df_wiki_toxic_test = df_wiki_toxic_test.rename(columns={"identity_hate": "identityhate"})

# careful: the dataset is multilabel
# I convert it to multiple different binary tasks
# result: df_wiki_toxic_dic maps each label column to {"train": frame, "test": frame}
df_wiki_toxic_dic = {}
for col in ["toxicaggregated", "obscene", "threat", "insult", "identityhate"]:
    # train
    # take balanced sample. "Other" class is much bigger (label 0). reduce it to 2 * size of positive class (label 1).
    # note: despite its name, len_positive_class already holds twice the number of positive rows
    len_positive_class = 2*len(df_wiki_toxic_train[df_wiki_toxic_train[col] == 1])
    # per-class cap: twice the positive class, but never more than 10_000 rows
    max_sample = min(len_positive_class, 10_000)
    df_wiki_toxic_train_col_balanced = df_wiki_toxic_train.groupby(col, as_index=False, group_keys=False).apply(lambda x: x.sample(min(len(x), max_sample), random_state=SEED_GLOBAL))
    # add label_text col: 1 -> task name, 0 -> "not_" + task name
    df_wiki_toxic_train_col_balanced["label_text"] = df_wiki_toxic_train_col_balanced[col].map({1: col, 0: "not_" + col})
    df_wiki_toxic_train_col_balanced = df_wiki_toxic_train_col_balanced[["text", "label_text", col]].rename(columns={col: "label_standard"})
    # shuffle first and reset the index afterwards, so the saved frame has a clean 0..n-1 index
    df_wiki_toxic_train_col_balanced = df_wiki_toxic_train_col_balanced.sample(frac=1, random_state=SEED_GLOBAL).reset_index(drop=True)
    # test
    # work on an independent copy: writing label_text into the shared test frame
    # would carry the labels of one task over into the next iteration
    df_wiki_toxic_test_col = df_wiki_toxic_test[["text", col]].copy()
    # add label_text col
    df_wiki_toxic_test_col["label_text"] = df_wiki_toxic_test_col[col].map({1: col, 0: "not_" + col})
    df_wiki_toxic_test_col = df_wiki_toxic_test_col[["text", "label_text", col]].rename(columns={col: "label_standard"}).reset_index(drop=True)
    # append to dic
    df_wiki_toxic_dic.update({col: {"train": df_wiki_toxic_train_col_balanced, "test": df_wiki_toxic_test_col}})
    print("Label distribution in dataset:\n", df_wiki_toxic_train_col_balanced.label_text.value_counts(), "\n")

# show example data
display(df_wiki_toxic_dic["toxicaggregated"]["train"].head())

In [None]:
# save harmonized data to disk
# one gzip-compressed parquet file per binary task and split, relative to the current working directory
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "wikitoxic"
for col in ["toxicaggregated", "obscene", "threat", "insult", "identityhate"]:
    df_wiki_toxic_dic[col]["train"].to_parquet(f"./datasets_standardized/ds_{dataset_name}_{col}_train.gzip", compression='gzip')
    df_wiki_toxic_dic[col]["test"].to_parquet(f"./datasets_standardized/ds_{dataset_name}_{col}_test.gzip", compression='gzip')

#### Hate Speech Offensive dataset

In [None]:
from datasets import load_dataset

dataset_hate_offensive = load_dataset("hate_speech_offensive")
# dataset only has train split
print("Raw dataset structure:\n", dataset_hate_offensive)

df_hate_offensive_train = dataset_hate_offensive["train"].to_pandas()

# only keep rows where at least 3 annotators agreed on same label
# .copy() gives an independent frame, so the column assignments below do not
# write to a slice of the raw frame (avoids SettingWithCopyWarning)
df_hate_offensive_train = df_hate_offensive_train[
    (df_hate_offensive_train.hate_speech_count >= 3) |
    (df_hate_offensive_train.offensive_language_count >= 3) |
    (df_hate_offensive_train.neither_count >= 3)
].copy()
print("Number of texts after selecting higher quality: ", len(df_hate_offensive_train))

# decode HTML entities (e.g. "&amp;" -> "&") in the tweets
import html
df_hate_offensive_train['tweet'] = df_hate_offensive_train['tweet'].apply(html.unescape)

df_hate_offensive_train = df_hate_offensive_train.rename(columns={"tweet": "text"})

# keep only the first occurrence of each text
df_hate_offensive_train = df_hate_offensive_train[~df_hate_offensive_train.text.duplicated()].copy()

# values of the "class" column -> text label
label_text_map_hate_offensive = {0: "hate_speech", 1: "offensive", 2: "neither"}
df_hate_offensive_train["label_text"] = df_hate_offensive_train["class"].map(label_text_map_hate_offensive)

# integer codes follow the sorted label_text values
df_hate_offensive_train["label_standard"] = df_hate_offensive_train.label_text.factorize(sort=True)[0]

df_hate_offensive_train = df_hate_offensive_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

# highly imbalanced - downsample to at most max_per_label rows per label
print("Label distribution in dataset before downsampling:\n", df_hate_offensive_train.label_text.value_counts())
max_per_label = 2000
df_hate_offensive_train = df_hate_offensive_train.groupby("label_text", as_index=False, group_keys=False).apply(lambda x: x.sample(n=min(max_per_label, len(x)), random_state=SEED_GLOBAL)).reset_index(drop=True)

print("Label distribution in dataset:\n", df_hate_offensive_train.label_text.value_counts())
display(df_hate_offensive_train)


In [None]:
## prepare df_test
# no test set
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as test set, stratified by label_text
(
    df_hate_offensive_train_split,
    df_hate_offensive_test_split,
) = train_test_split(
    df_hate_offensive_train,
    stratify=df_hate_offensive_train["label_text"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED_GLOBAL,
)
# sizes of the two resulting splits, one per line
print(len(df_hate_offensive_train_split), len(df_hate_offensive_test_split), sep="\n")


In [None]:
# save harmonized data to disk
# the index is reset first because the split frames keep the row labels of the frame they were split from
# NOTE(review): assumes ./datasets_standardized already exists - confirm
dataset_name = "hateoffensive"
df_hate_offensive_train_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_hate_offensive_test_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')


#### HateXplain dataset

In [None]:
from datasets import load_dataset
import pandas as pd

dataset_hatexplain = load_dataset("hatexplain")

# merge train and val. not doing hp tuning
df_hatexplain_train = pd.concat(
    [dataset_hatexplain[split_name].to_pandas() for split_name in ("train", "validation")]
)
df_hatexplain_test = dataset_hatexplain["test"].to_pandas()


In [None]:
import ast

# pull the 'label' entry out of the annotators value stored in each cell
df_hatexplain_train['label_original'] = df_hatexplain_train['annotators'].apply(
    lambda annotation: annotation['label']
)
df_hatexplain_test['label_original'] = df_hatexplain_test['annotators'].apply(
    lambda annotation: annotation['label']
)

# get consensus label
def get_unique_value_if_all_same(x):
    """Return the consensus label of a sequence of annotator labels.

    Args:
        x: indexable, sized sequence of labels (e.g. a list or 1-d numpy array).

    Returns:
        ``int(x[0])`` when every element equals the first one, otherwise ``None``.
        An empty sequence has no consensus and also yields ``None`` (previously
        this case raised ``IndexError``).
    """
    # Guard first: all() over an empty sequence is True, which would fall through to x[0].
    if len(x) == 0:
        return None

    first_label = x[0]
    if all(item == first_label for item in x):
        return int(first_label)
    return None

# Reduce each row's annotator labels to a single consensus label (None when annotators disagree)
df_hatexplain_train['label_original'] = df_hatexplain_train['label_original'].apply(get_unique_value_if_all_same)
df_hatexplain_test['label_original'] = df_hatexplain_test['label_original'].apply(get_unique_value_if_all_same)

# Drop rows without consensus. The explicit .copy() makes the filtered frames independent,
# so the column assignments below do not raise pandas' SettingWithCopyWarning.
print(len(df_hatexplain_train))
df_hatexplain_train = df_hatexplain_train[~df_hatexplain_train['label_original'].isna()].copy()
df_hatexplain_test = df_hatexplain_test[~df_hatexplain_test['label_original'].isna()].copy()
print(len(df_hatexplain_train))

# join text
df_hatexplain_train["text"] = df_hatexplain_train["post_tokens"].apply(lambda x: " ".join(x))
df_hatexplain_test["text"] = df_hatexplain_test["post_tokens"].apply(lambda x: " ".join(x))

# label map
label_text_map_hatexplain = {
    0: "hate_speech",
    1: "neither",
    2: "offensive",
}
df_hatexplain_train["label_text"] = df_hatexplain_train["label_original"].map(
    label_text_map_hatexplain
)
df_hatexplain_test["label_text"] = df_hatexplain_test["label_original"].map(
    label_text_map_hatexplain
)

# Integer codes follow the alphabetical order of label_text.
# NOTE(review): train and test are factorized independently; the codes only line up
# if both frames contain the same set of labels - confirm.
df_hatexplain_train["label_standard"] = df_hatexplain_train.label_text.factorize(sort=True)[0]
df_hatexplain_test["label_standard"] = df_hatexplain_test.label_text.factorize(sort=True)[0]


# clean
df_hatexplain_train = df_hatexplain_train[[
    "text", "label_text", "label_standard", #"id", #"annotators", #"rationales", "post_tokens"
]].reset_index(drop=True)
df_hatexplain_test = df_hatexplain_test[[
    "text", "label_text", "label_standard", #"id", #"annotators", #"rationales", "post_tokens"
]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_hatexplain_train.label_text.value_counts())
display(df_hatexplain_train)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "hatexplain"
df_hatexplain_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_hatexplain_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### SMS Spam dataset

In [None]:
dataset_spam = load_dataset("sms_spam")
print("Raw dataset structure:\n", dataset_spam)

## prepare df_train
df_spam_train = dataset_spam["train"].to_pandas()

# Map the integer class ids to readable label names
label_text_map_spam = {0: "not_spam", 1: "spam"}
df_spam_train["label_text"] = df_spam_train.label.map(label_text_map_spam)

df_spam_train = df_spam_train.rename(columns={"sms": "text"})
# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_spam_train = df_spam_train[~df_spam_train.text.duplicated()].copy()

# Integer codes follow the alphabetical order of label_text
df_spam_train["label_standard"] = df_spam_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_spam_train))
n_data_per_label = 10_000
df_spam_train = df_spam_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_spam_train))

# final harmonized format
df_spam_train = df_spam_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_spam_train.label_text.value_counts())
display(df_spam_train)



In [None]:
## prepare df_test
# No test set is available, so hold out a stratified 20% of the harmonized train frame.
from sklearn.model_selection import train_test_split

df_spam_train_split, df_spam_test_split = train_test_split(
    df_spam_train,
    stratify=df_spam_train["label_text"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED_GLOBAL,
)
# Sizes of the resulting train / test parts, one per line
print(len(df_spam_train_split), len(df_spam_test_split), sep="\n")


In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory with a fresh 0..n-1 index.
dataset_name = "spam"
df_spam_train_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_spam_test_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')


#### MASSIVE intent dataset

In [None]:
dataset_massive = load_dataset("AmazonScience/massive", "en-US")
print("Raw dataset structure:\n", dataset_massive)

# has two potential labels: scenario and intent. choosing the more fine-grained "intent"
# Lookup from integer intent id to intent name, read from the train split's feature metadata
label_mapping_massive = {
    idx: name for idx, name in enumerate(dataset_massive["train"].features["intent"].names)
}

## prepare df_train
# train and validation are merged into one training frame
df_massive_train = pd.concat([
    dataset_massive["train"].to_pandas(),
    dataset_massive["validation"].to_pandas()
])

df_massive_train["label_text"] = df_massive_train["intent"].map(label_mapping_massive)

# Integer codes follow the alphabetical order of label_text
df_massive_train["label_standard"] = df_massive_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_massive_train))
n_data_per_label = 10_000
df_massive_train = df_massive_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_massive_train))

# final harmonized format
df_massive_train = df_massive_train.rename(columns={"utt": "text"})
# Keep the first occurrence of every utterance
df_massive_train = df_massive_train[~df_massive_train.text.duplicated()]

df_massive_train = df_massive_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_massive_train.label_text.value_counts())
display(df_massive_train)



In [None]:
## prepare df_test
# Build the harmonized test frame in one chain: name the text column, attach the
# intent name, derive integer codes (alphabetical order of label_text), keep 3 columns.
df_massive_test = (
    dataset_massive["test"]
    .to_pandas()
    .assign(label_text=lambda df: df["intent"].map(label_mapping_massive))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .rename(columns={"utt": "text"})
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)

display(df_massive_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "massive"
df_massive_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_massive_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Banking77 dataset

In [None]:
dataset_banking77 = load_dataset("PolyAI/banking77")
print("Raw dataset structure:\n", dataset_banking77)

# Lookup from integer class id to class name, read from the train split's feature metadata
label_mapping_banking77 = {
    idx: name for idx, name in enumerate(dataset_banking77["train"].features["label"].names)
}

## prepare df_train
df_banking77_train = dataset_banking77["train"].to_pandas()

df_banking77_train["label_text"] = df_banking77_train.label.map(label_mapping_banking77)

# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_banking77_train = df_banking77_train[~df_banking77_train.text.duplicated()].copy()

# Integer codes follow the alphabetical order of label_text
df_banking77_train["label_standard"] = df_banking77_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_banking77_train))
n_data_per_label = 10_000
df_banking77_train = df_banking77_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_banking77_train))

df_banking77_train = df_banking77_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_banking77_train.label_text.value_counts())
display(df_banking77_train)


In [None]:
## prepare df_test
# Build the harmonized test frame in one chain: attach the class name, derive integer
# codes (alphabetical order of label_text), keep the three harmonized columns.
df_banking77_test = (
    dataset_banking77["test"]
    .to_pandas()
    .assign(label_text=lambda df: df["label"].map(label_mapping_banking77))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)

display(df_banking77_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "banking77"
df_banking77_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_banking77_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### emotion6 dataset

In [None]:
dataset_emotion_dair = load_dataset("dair-ai/emotion")
print("Raw dataset structure:\n", dataset_emotion_dair)

# Lookup from integer class id to class name, read from the train split's feature metadata
label_mapping_emotion_dair = {
    idx: name for idx, name in enumerate(dataset_emotion_dair["train"].features["label"].names)
}

## prepare df_train
# train and validation are merged into one training frame
df_emotion_dair_train = pd.concat([
    dataset_emotion_dair["train"].to_pandas(),
    dataset_emotion_dair["validation"].to_pandas()
])
#df_emotion_dair_train = dataset_emotion_dair["train"].to_pandas()

df_emotion_dair_train["label_text"] = df_emotion_dair_train.label.map(label_mapping_emotion_dair)

# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_emotion_dair_train = df_emotion_dair_train[~df_emotion_dair_train.text.duplicated()].copy()

# Integer codes follow the alphabetical order of label_text
df_emotion_dair_train["label_standard"] = df_emotion_dair_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_emotion_dair_train))
n_data_per_label = 10_000
df_emotion_dair_train = df_emotion_dair_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_emotion_dair_train))


df_emotion_dair_train = df_emotion_dair_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_emotion_dair_train.label_text.value_counts())
display(df_emotion_dair_train)


In [None]:
## prepare df_test
# Build the harmonized test frame in one chain: attach the class name, derive integer
# codes (alphabetical order of label_text), keep the three harmonized columns.
df_emotion_dair_test = (
    dataset_emotion_dair["test"]
    .to_pandas()
    .assign(label_text=lambda df: df["label"].map(label_mapping_emotion_dair))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)

display(df_emotion_dair_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "emotiondair"
df_emotion_dair_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_emotion_dair_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### EmoContext dataset

In [None]:
dataset_emocontext = load_dataset("emo")

# Lookup from integer class id to class name, read from the train split's feature metadata
label_mapping_emocontext = {
    idx: name for idx, name in enumerate(dataset_emocontext["train"].features["label"].names)
}

## prepare df_train
df_emocontext_train = dataset_emocontext["train"].to_pandas()

df_emocontext_train["label_text"] = df_emocontext_train.label.map(label_mapping_emocontext)

# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_emocontext_train = df_emocontext_train[~df_emocontext_train.text.duplicated()].copy()

# Integer codes follow the alphabetical order of label_text
df_emocontext_train["label_standard"] = df_emocontext_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_emocontext_train))
n_data_per_label = 10_000
df_emocontext_train = df_emocontext_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_emocontext_train))

df_emocontext_train = df_emocontext_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_emocontext_train.label_text.value_counts())
display(df_emocontext_train)


In [None]:
## prepare df_test
# Build the harmonized test frame in one chain: attach the class name, derive integer
# codes (alphabetical order of label_text), keep the three harmonized columns.
df_emocontext_test = (
    dataset_emocontext["test"]
    .to_pandas()
    .assign(label_text=lambda df: df["label"].map(label_mapping_emocontext))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)

display(df_emocontext_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "emocontext"
df_emocontext_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_emocontext_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Empathetic Dialogue dataset

In [None]:
# ! Challenge with dataset: is effectively multi-label. NLI augmentation can lead to errors
# strongly cleaning and downsampling to reduce risk

from datasets import load_dataset

dataset_empathetic = load_dataset("empathetic_dialogues")
print(dataset_empathetic)

# train and validation are merged into one frame; at this point it still has
# one row per utterance (rows are grouped by conv_id further down)
df_empathetic_train = pd.concat([
    dataset_empathetic["train"].to_pandas(),
    dataset_empathetic["validation"].to_pandas()
])

def merge_dialogues(example):
    """Collapse the rows of one conversation into a single text/label record.

    Args:
        example: DataFrame holding the rows of one conversation, with the
            columns "prompt", "utterance" and "context".

    Returns:
        pd.Series with "text" (the first row's prompt as a "Context:" line followed
        by the utterances, alternately tagged "Speaker 1" / "Speaker 2", with every
        "_comma_" placeholder replaced by ",") and "label_text" (the first row's
        "context" value).
    """
    speaker_tags = ("Speaker 1", "Speaker 2")

    # Header line first, then one line per utterance; even positions belong to speaker 1.
    lines = ["Context: " + example["prompt"].iloc[0]]
    for turn_idx, utterance in enumerate(example["utterance"].to_list()):
        lines.append(f"{speaker_tags[turn_idx % 2]}: {utterance}")

    dialogue = "\n".join(lines).replace("_comma_", ",")

    return pd.Series({"text": dialogue, "label_text": example["context"].iloc[0]})


# One record per conversation: merge each conv_id group into a single text, keep the
# first occurrence of every text, then drop the grouping key and renumber the rows.
df_empathetic_train = (
    df_empathetic_train
    .groupby(by="conv_id", as_index=False, group_keys=False)
    .apply(merge_dialogues)
    .loc[lambda df: ~df.text.duplicated()]
    .drop(columns=["conv_id"])
    .reset_index(drop=True)
)

# Integer codes follow the alphabetical order of label_text
df_empathetic_train["label_standard"] = df_empathetic_train.label_text.factorize(sort=True)[0]

print("Label distribution in dataset:\n", df_empathetic_train.label_text.value_counts())

display(df_empathetic_train)


In [None]:
# Same pipeline as for the train frame, applied to the "test" split: merge each
# conv_id group into one text, keep first occurrences, drop the key, renumber.
df_empathetic_test = (
    dataset_empathetic["test"]
    .to_pandas()
    .groupby(by="conv_id", as_index=False, group_keys=False)
    .apply(merge_dialogues)
    .loc[lambda df: ~df.text.duplicated()]
    .drop(columns=["conv_id"])
    .reset_index(drop=True)
)

# Integer codes follow the alphabetical order of label_text
df_empathetic_test["label_standard"] = df_empathetic_test.label_text.factorize(sort=True)[0]

display(df_empathetic_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "empathetic"
df_empathetic_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_empathetic_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### agnews dataset

In [None]:
dataset_agnews = load_dataset("ag_news")

# Lookup from integer class id to class name, read from the train split's feature metadata
label_mapping_agnews = {
    idx: name for idx, name in enumerate(dataset_agnews["train"].features["label"].names)
}

## prepare df_train
# Use the "train" split only. The "test" split was previously concatenated in here,
# but it is also saved as the evaluation set further down, so merging it into the
# training frame leaked test rows into training.
df_agnews_train = dataset_agnews["train"].to_pandas()

df_agnews_train["label_text"] = df_agnews_train.label.map(label_mapping_agnews)

# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_agnews_train = df_agnews_train[~df_agnews_train.text.duplicated()].copy()

# Integer codes follow the alphabetical order of label_text
df_agnews_train["label_standard"] = df_agnews_train.label_text.factorize(sort=True)[0]

# downsample if too large
print("Dataset length before downsampling: ", len(df_agnews_train))
n_data_per_label = 10_000
df_agnews_train = df_agnews_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
# This message used to repeat "before downsampling" although it reports the capped length
print("Dataset length after downsampling: ", len(df_agnews_train))


df_agnews_train = df_agnews_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_agnews_train.label_text.value_counts())
display(df_agnews_train)


In [None]:
## prepare df_test
# Build the harmonized test frame in one chain: attach the class name, derive integer
# codes (alphabetical order of label_text), keep the three harmonized columns.
df_agnews_test = (
    dataset_agnews["test"]
    .to_pandas()
    .assign(label_text=lambda df: df["label"].map(label_mapping_agnews))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)

display(df_agnews_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "agnews"
df_agnews_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_agnews_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### Yahoo answers topics

In [None]:
dataset_yahoo_topics = load_dataset("yahoo_answers_topics")

# Lookup from integer topic id to topic name, read from the train split's feature metadata
label_mapping_yahoo_topics = {
    idx: name for idx, name in enumerate(dataset_yahoo_topics["train"].features["topic"].names)
}

df_yahoo_topics_train = dataset_yahoo_topics["train"].to_pandas()

df_yahoo_topics_train = df_yahoo_topics_train.rename(columns={"topic": "label"})
# One input text per row: question title + question body, blank line, best answer
df_yahoo_topics_train["text"] = "Question: " + df_yahoo_topics_train["question_title"] + " " + df_yahoo_topics_train["question_content"] + "\n\n" + "Answer: " + df_yahoo_topics_train["best_answer"]

df_yahoo_topics_train["label_text"] = df_yahoo_topics_train.label.map(label_mapping_yahoo_topics)
# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_yahoo_topics_train = df_yahoo_topics_train[~df_yahoo_topics_train.text.duplicated()].copy()
# Integer codes follow the alphabetical order of label_text
df_yahoo_topics_train["label_standard"] = df_yahoo_topics_train.label_text.factorize(sort=True)[0]

# too large, downsample
n_data_per_label = 10_000
print("Dataset length before downsampling: ", len(df_yahoo_topics_train))
df_yahoo_topics_train = df_yahoo_topics_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
print("Dataset length after downsampling: ", len(df_yahoo_topics_train))


df_yahoo_topics_train = df_yahoo_topics_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_yahoo_topics_train.label_text.value_counts())
display(df_yahoo_topics_train)


In [None]:
## prepare df_test
# Build the harmonized test frame in one chain, using the same text layout as the
# train frame: question title + question body, blank line, best answer.
df_yahoo_topics_test = (
    dataset_yahoo_topics["test"]
    .to_pandas()
    .rename(columns={"topic": "label"})
    .assign(
        text=lambda df: "Question: " + df["question_title"] + " " + df["question_content"] + "\n\n" + "Answer: " + df["best_answer"]
    )
    .assign(label_text=lambda df: df["label"].map(label_mapping_yahoo_topics))
    .assign(label_standard=lambda df: df["label_text"].factorize(sort=True)[0])
    .loc[:, ["text", "label_text", "label_standard"]]
    .reset_index(drop=True)
)
display(df_yahoo_topics_test)

In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory.
dataset_name = "yahootopics"
df_yahoo_topics_train.to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_yahoo_topics_test.to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')

#### True Teacher dataset

In [None]:
# TODO: later train longer context model with non-shortened data

In [None]:
## download data
# paper: http://arxiv.org/abs/2305.11171
import pandas as pd
import zipfile
import requests
from io import BytesIO

# URL of the zip file
url = "https://storage.googleapis.com/gresearch/true_teacher/true_teacher_data.zip"

# Download the zip archive. Fail right here on an HTTP error status instead of
# passing an error body on to zipfile, where it would surface as a confusing BadZipFile.
content = requests.get(url)
content.raise_for_status()

# Open the downloaded bytes as an in-memory zip archive
zf = zipfile.ZipFile(BytesIO(content.content))

# Print the archive members for inspection; the jsonl members to read are listed explicitly
print(zf.namelist())
json_file_name_lst = ['true_teacher_data/t5-large.jsonl', 'true_teacher_data/t5-small.jsonl', 'true_teacher_data/t5-base.jsonl', 'true_teacher_data/t5-3b.jsonl', 'true_teacher_data/t5-11b.jsonl']

# Read every jsonl member into a frame and tag its rows with the model name,
# i.e. the member's file name without directory and extension (e.g. "t5-large")
df_trueteacher_lst = []
for json_file_name in json_file_name_lst:
    with zf.open(json_file_name) as f:
        df_step = pd.read_json(f, lines=True)
        df_step["model"] = json_file_name.split("/")[1].split(".")[0]
        df_trueteacher_lst.append(df_step)

df_trueteacher = pd.concat(df_trueteacher_lst)

# Map the binary label to a readable name
label_text_map_trueteacher = {0: "factually_inconsistent", 1: "factually_consistent"}
df_trueteacher["label_text"] = df_trueteacher.label.map(label_text_map_trueteacher)

df_trueteacher.head()


In [None]:
## merge with cnndm dataset to get full texts
from datasets import load_dataset

dataset_cnndm = load_dataset("cnn_dailymail", '3.0.0')

# Two-column lookup frame: article id (named to match df_trueteacher's join key) and article text
df_cnndm_train = pd.DataFrame({
    "cnndm_id": dataset_cnndm["train"]["id"],
    "article": dataset_cnndm["train"]["article"]
})

# Inner join: rows whose cnndm_id has no match in the cnndm train split are dropped
df_trueteacher_full = pd.merge(df_trueteacher, df_cnndm_train, on="cnndm_id", how="inner")
print(len(df_trueteacher_full))


In [None]:
## True teacher does not seem to use cnndm test set
# not sure how they created test set
#df_cnndm_test = pd.DataFrame({"cnndm_id": dataset_cnndm["test"]["id"], "article": dataset_cnndm["test"]["article"]})
#df_trueteacher_test = pd.merge(df_trueteacher, df_cnndm_test, on="cnndm_id", how="inner")
#df_cnndm_validation = pd.DataFrame({"cnndm_id": dataset_cnndm["validation"]["id"], "article": dataset_cnndm["validation"]["article"]})
#df_trueteacher_validation = pd.merge(df_trueteacher, df_cnndm_validation, on="cnndm_id", how="inner")
#df_cnndm_validation.head()
#print(len(df_trueteacher_test))
#print(len(df_trueteacher_validation))

In [None]:
# remove too long texts
print("Text length distribution:\n", df_trueteacher_full.article.str.len().value_counts(bins=10))

# Character budget for article + summary combined; longer pairs are dropped, not truncated
max_characters = 512 * 3
df_trueteacher_short = df_trueteacher_full[(df_trueteacher_full.article.str.len() + df_trueteacher_full.summary.str.len()) < max_characters]
label_distribution = df_trueteacher_short.label.value_counts()
print("Raw label distribution (after truncation):\n", label_distribution)

# sample for label balance
# every label is cut down to the size of the rarest label
df_trueteacher_short_samp = df_trueteacher_short.groupby(
    "label", group_keys=False, as_index=False
    ).apply(
        lambda x: x.sample(n=label_distribution.min(), random_state=SEED_GLOBAL)
    )
# shuffle
df_trueteacher_short_samp = df_trueteacher_short_samp.sample(
    n=len(df_trueteacher_short_samp), random_state=SEED_GLOBAL
)
print("Downsampled balanced label distribution:\n", df_trueteacher_short_samp.label.value_counts())

# input text formatting
df_trueteacher_short_samp["text"] = "Summary:\n" + df_trueteacher_short_samp.summary + "\n\nFull article:\n" + df_trueteacher_short_samp.article

print("Number of texts before deduplication: ", len(df_trueteacher_short_samp))
# Keep the first occurrence of every text. The explicit .copy() makes the filtered frame
# independent, so the assignment below does not raise pandas' SettingWithCopyWarning.
df_trueteacher_short_samp = df_trueteacher_short_samp[~df_trueteacher_short_samp.text.duplicated()].copy()
print("Number of texts after deduplication: ", len(df_trueteacher_short_samp))

# Integer codes follow the alphabetical order of label_text
df_trueteacher_short_samp["label_standard"] = df_trueteacher_short_samp.label_text.factorize(sort=True)[0]

# final format
df_trueteacher_train = df_trueteacher_short_samp[["text", "label_text", "label_standard"]].reset_index(drop=True).copy(deep=True)

display(df_trueteacher_train.head())

In [None]:
## prepare df_test
# No test set is available, so hold out a stratified 20% of the harmonized train frame.
from sklearn.model_selection import train_test_split

df_trueteacher_train_split, df_trueteacher_test_split = train_test_split(
    df_trueteacher_train,
    stratify=df_trueteacher_train["label_text"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED_GLOBAL,
)
# Sizes of the resulting train / test parts, one per line
print(len(df_trueteacher_train_split), len(df_trueteacher_test_split), sep="\n")


In [None]:
# save harmonized data to disk
# Files are Parquet with gzip compression (the ".gzip" suffix is only a naming choice),
# written relative to the current working directory with a fresh 0..n-1 index.
dataset_name = "trueteacher"
df_trueteacher_train_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_train.gzip", compression='gzip')
df_trueteacher_test_split.reset_index(drop=True).to_parquet(f"./datasets_standardized/ds_{dataset_name}_test.gzip", compression='gzip')


#### Anthropic RLHF datasets

In [None]:
# TODO: later train longer context model with non-shortened data

In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# dataset: https://huggingface.co/datasets/Anthropic/hh-rlhf
# paper: https://arxiv.org/pdf/2204.05862.pdf
# subsets described on p. 12
# Each subset is loaded separately through its data_dir: three "helpful" ones and one "harmless" one
dataset_anthropic_rlhf_helpful_base = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")
dataset_anthropic_rlhf_helpful_rs = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-rejection-sampled")
dataset_anthropic_rlhf_helpful_online = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-online")
dataset_anthropic_rlhf_harmless = load_dataset("Anthropic/hh-rlhf", data_dir="harmless-base")
# not meant for preference modelling, different format, see: https://huggingface.co/datasets/Anthropic/hh-rlhf
#dataset_anthropic_rlhf_redteam = load_dataset("Anthropic/hh-rlhf", data_dir="red-team-attempts")

# Combine the three helpful subsets into one DatasetDict, split by split
dataset_anthropic_rlhf_helpful = DatasetDict({
    "train": concatenate_datasets([
        dataset_anthropic_rlhf_helpful_base["train"],
        dataset_anthropic_rlhf_helpful_rs["train"],
        dataset_anthropic_rlhf_helpful_online["train"]
    ]),
    "test": concatenate_datasets([
        dataset_anthropic_rlhf_helpful_base["test"],
        dataset_anthropic_rlhf_helpful_rs["test"],
        dataset_anthropic_rlhf_helpful_online["test"]
    ])
})


## specific dataset characteristics:
# the same conversation can appear multiple times, only that new X conversation turns have been added
#df_testing = dataset_anthropic_rlhf_redteam["train"].to_pandas()[:100]
#df_testing = df_testing[df_testing["chosen"].str.contains("Human: How can I store food if I don't hav")]

# Print both DatasetDict structures for inspection
print("dataset helpful: ", dataset_anthropic_rlhf_helpful)
print("dataset harmless: ", dataset_anthropic_rlhf_harmless)


In [None]:
## tailored prompt
import random


def prompt_format_anthropic_rlhf(example, helpful_or_harmless=None, make_prompt_true=True):
    """Format one chosen/rejected conversation pair as a text plus a hypothesis.

    The two conversations are shown in a randomly chosen order together with a
    randomly phrased hypothesis ("more ... in the first/second conversation" or
    "less ... in the first/second conversation").

    Args:
        example: mapping with the string entries "chosen" and "rejected".
        helpful_or_harmless: quality phrase interpolated into the hypothesis.
        make_prompt_true: if truthy, the hypothesis correctly describes the text
            (label 0). Otherwise the same hypothesis is paired with the
            conversations in flipped order, which makes it wrong (label 1).

    Returns:
        dict with the keys "text", "labels", "label_text" and "hypothesis";
        "label_text" and "hypothesis" hold the same string.
    """
    # Both draws come from the stdlib `random` module. The previous body also called
    # np.random.seed(None), which never influenced these draws but did wipe the
    # notebook-wide numpy seed on every call; the numpy RNG is now left untouched.
    chosen_first = random.choice([True, False])
    positive_prompt = random.choice([True, False])

    # Hypothesis that is TRUE when the chosen conversation is placed according to chosen_first:
    #   chosen first  + positive -> "more ... first"     chosen first  + negative -> "less ... second"
    #   chosen second + positive -> "more ... second"    chosen second + negative -> "less ... first"
    comparative = "more" if positive_prompt else "less"
    position = "first" if chosen_first == positive_prompt else "second"
    hypothesis = f"The assistant is {comparative} {helpful_or_harmless} in the {position} conversation."

    # A wrong example keeps the hypothesis and simply flips the order of the conversations.
    chosen_on_top = chosen_first if make_prompt_true else not chosen_first
    chosen = example["chosen"].strip()
    rejected = example["rejected"].strip()
    first, second = (chosen, rejected) if chosen_on_top else (rejected, chosen)
    text = f"First conversation:\n{first}\n\nSecond conversation:\n{second}"

    label = 0 if make_prompt_true else 1

    return {"text": text, "labels": label, "label_text": hypothesis, "hypothesis": hypothesis}


# for each dataset (helpful & harmless) create two rows per text: one with correct label/hypothesis and one with false
# helpful dataset
# Two independent passes over the same pairs: one yields true hypotheses (labels 0), one false (labels 1)
dataset_anthropic_rlhf_helpful_true = dataset_anthropic_rlhf_helpful.map(
    lambda example: prompt_format_anthropic_rlhf(
        example, helpful_or_harmless="helpful and honest",
        make_prompt_true=True
    )
)
dataset_anthropic_rlhf_helpful_false = dataset_anthropic_rlhf_helpful.map(
    lambda example: prompt_format_anthropic_rlhf(
        example, helpful_or_harmless="helpful and honest",
        make_prompt_true=False
    )
)

# Stack the true and false rows per split; note that this rebinds dataset_anthropic_rlhf_helpful
dataset_anthropic_rlhf_helpful_train = concatenate_datasets([dataset_anthropic_rlhf_helpful_true["train"], dataset_anthropic_rlhf_helpful_false["train"]])
dataset_anthropic_rlhf_helpful_test = concatenate_datasets([dataset_anthropic_rlhf_helpful_true["test"], dataset_anthropic_rlhf_helpful_false["test"]])
dataset_anthropic_rlhf_helpful = DatasetDict({"train": dataset_anthropic_rlhf_helpful_train, "test": dataset_anthropic_rlhf_helpful_test})

#dataset_anthropic_rlhf_helpful = dataset_anthropic_rlhf_helpful.remove_columns(["chosen", "rejected"])
# Seeded shuffle so the true and false rows are interleaved
dataset_anthropic_rlhf_helpful = dataset_anthropic_rlhf_helpful.shuffle(seed=SEED_GLOBAL)

# harmless dataset
# Same procedure as for the helpful dataset, with "harmless" as the quality phrase
dataset_anthropic_rlhf_harmless_true = dataset_anthropic_rlhf_harmless.map(
    lambda example: prompt_format_anthropic_rlhf(
        example, helpful_or_harmless="harmless",
        make_prompt_true=True
    )
)
dataset_anthropic_rlhf_harmless_false = dataset_anthropic_rlhf_harmless.map(
    lambda example: prompt_format_anthropic_rlhf(
        example, helpful_or_harmless="harmless",
        make_prompt_true=False
    )
)
# Stack the true and false rows per split; note that this rebinds dataset_anthropic_rlhf_harmless
dataset_anthropic_rlhf_harmless_train = concatenate_datasets([dataset_anthropic_rlhf_harmless_true["train"], dataset_anthropic_rlhf_harmless_false["train"]])
dataset_anthropic_rlhf_harmless_test = concatenate_datasets([dataset_anthropic_rlhf_harmless_true["test"], dataset_anthropic_rlhf_harmless_false["test"]])
dataset_anthropic_rlhf_harmless = DatasetDict({"train": dataset_anthropic_rlhf_harmless_train, "test": dataset_anthropic_rlhf_harmless_test})

dataset_anthropic_rlhf_harmless = dataset_anthropic_rlhf_harmless.shuffle(seed=SEED_GLOBAL)

# inspect results
print("Label distribution harmless\n", dataset_anthropic_rlhf_harmless["train"].to_pandas().labels.value_counts(), "\n")
print("Label distribution helpful\n", dataset_anthropic_rlhf_helpful["train"].to_pandas().labels.value_counts(), "\n")

display(dataset_anthropic_rlhf_harmless["train"].to_pandas().head(100))


In [None]:
# drop rows whose text is very short or too long (rows are filtered out, texts are not truncated)
# note: the prints show the full DatasetDict summary, including the row count per split
print("Length harmless before shortening: ", dataset_anthropic_rlhf_harmless)
print("Length helpful before shortening: ", dataset_anthropic_rlhf_helpful)

max_characters = 512 * 3


def _text_length_in_range(example):
    """Return True if the example's text has between 100 and max_characters characters (inclusive)."""
    return 100 <= len(example["text"]) <= max_characters


dataset_anthropic_rlhf_harmless = dataset_anthropic_rlhf_harmless.filter(_text_length_in_range)
dataset_anthropic_rlhf_helpful = dataset_anthropic_rlhf_helpful.filter(_text_length_in_range)

print("\nLength harmless after shortening: ", dataset_anthropic_rlhf_harmless)
print("Length helpful after shortening: ", dataset_anthropic_rlhf_helpful)


In [None]:
# save harmonized data to disk: one gzip-compressed parquet file per dataset and split
for _subset_name, _subset in (
    ("harmless", dataset_anthropic_rlhf_harmless),
    ("helpful", dataset_anthropic_rlhf_helpful),
):
    for _split in ("train", "test"):
        _subset[_split].to_pandas().to_parquet(
            f"./datasets_standardized/ds_anthropic_{_subset_name}_{_split}.gzip", compression='gzip'
        )

### New Dataset template
Below is copy-pasteable code for adding new datasets. Datasets may need some custom wrangling/cleaning, so make sure to inspect the data and adapt the code.

In [None]:
# Deliberate stop: this assertion always fails, so running the whole notebook halts here
# and the placeholder ("XXX") template cells below are not executed by accident.
assert 1 == 2, "Block following code from executing when running entire notebook"

In [None]:
# Template: replace every "XXX" with the dataset name and adapt the split/column names.
dataset_XXX = load_dataset("XXX")
print("Raw dataset structure:\n", dataset_XXX)

# integer class id as stored in the hub dataset -> class name
label_mapping_XXX = {
    idx: name for idx, name in enumerate(dataset_XXX["train"].features["label"].names)
}
# class name -> standardized id, derived from the FULL class list in sorted order.
# Factorizing each split separately would assign different ids to the same class
# whenever a split does not contain every class.
label_standard_mapping_XXX = {
    name: idx for idx, name in enumerate(sorted(set(label_mapping_XXX.values())))
}

## prepare df_train
df_XXX_train = pd.concat([
    dataset_XXX["train"].to_pandas(),
    dataset_XXX["validation"].to_pandas()
])

df_XXX_train["label_text"] = df_XXX_train.label.map(label_mapping_XXX)

# keep only the first occurrence of each text
df_XXX_train = df_XXX_train[~df_XXX_train.text.duplicated()]

# rows without a known class name get -1, the same sentinel factorize uses for missing values
df_XXX_train["label_standard"] = (
    df_XXX_train.label_text.map(label_standard_mapping_XXX).fillna(-1).astype("int64")
)

# downsample if too large
print("Dataset length before downsampling: ", len(df_XXX_train))
n_data_per_label = 10_000
df_XXX_train = df_XXX_train.groupby("label_text", as_index=False, group_keys=False).apply(
    lambda x: x.sample(min(n_data_per_label, len(x)), random_state=SEED_GLOBAL)
)
print("Dataset length after downsampling: ", len(df_XXX_train))

# final harmonized format
df_XXX_train = df_XXX_train[["text", "label_text", "label_standard"]].reset_index(drop=True)

print("Label distribution in dataset:\n", df_XXX_train.label_text.value_counts())
display(df_XXX_train)



In [None]:
## prepare df_test
df_XXX_test = dataset_XXX["test"].to_pandas()
df_XXX_test["label_text"] = df_XXX_test.label.map(label_mapping_XXX)

# Derive the standardized ids from the FULL class list (sorted by name) instead of
# factorizing the test split on its own: if the test split lacks a class, factorize
# would shift the ids and they would no longer match the ids of the train split.
label_standard_mapping_XXX = {
    name: idx for idx, name in enumerate(sorted(set(label_mapping_XXX.values())))
}
# rows without a known class name get -1, the same sentinel factorize uses for missing values
df_XXX_test["label_standard"] = (
    df_XXX_test.label_text.map(label_standard_mapping_XXX).fillna(-1).astype("int64")
)

# final harmonized format
df_XXX_test = df_XXX_test[["text", "label_text", "label_standard"]].reset_index(drop=True)

display(df_XXX_test)

In [None]:
# save harmonized data to disk: one gzip-compressed parquet file per split
dataset_name = "XXX"
for _split, _df_split in (("train", df_XXX_train), ("test", df_XXX_test)):
    _df_split.to_parquet(
        f"./datasets_standardized/ds_{dataset_name}_{_split}.gzip", compression='gzip'
    )