In [34]:
from collections import Counter
from pathlib import Path

import datasets

In [37]:
def subset_from_csv(file_path, rename_noemo=None):
    """Read a tab-separated ``label<TAB>text`` file into two parallel lists.

    Each non-blank line is split on its first tab only, so any further tabs
    remain part of the text. The line terminator is removed from the text.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.
        rename_noemo: If not None, every label equal to "noemo" is replaced
            by this value.

    Returns:
        A dict with the keys "category" and "text", each holding a list with
        one entry per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    subset = {"category": [], "text": []}

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Drop the line terminator so it does not end up inside the text.
            line = raw_line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. at the end of the file) carry no example.
                continue

            class_, sep, text = line.partition("\t")
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'label<TAB>text', got {line!r}"
                )

            if rename_noemo is not None and class_ == "noemo":
                class_ = rename_noemo
            subset["category"].append(class_)
            subset["text"].append(text)

    return subset

def build_dataset_emotion(data_path, train_fname, rename_noemo=None):
    """Assemble the emotion DatasetDict from the files under ``data_path``.

    The "train" split is read from ``train_fname``, "validation" from
    ``dev.txt`` and "test" from ``test.txt``; every file is parsed with
    ``subset_from_csv`` and ``rename_noemo`` is forwarded unchanged.

    Args:
        data_path: Directory containing the three split files.
        train_fname: File name of the training split inside ``data_path``.
        rename_noemo: Passed through to ``subset_from_csv``.

    Returns:
        A ``datasets.DatasetDict`` with "train", "validation" and "test".
    """
    root = Path(data_path)
    split_files = {
        "train": train_fname,
        "validation": "dev.txt",
        "test": "test.txt",
    }

    # Parse every file first, then wrap the parsed columns into Datasets.
    parsed = {
        split: subset_from_csv(root/fname, rename_noemo=rename_noemo)
        for split, fname in split_files.items()
    }
    return datasets.DatasetDict({
        split: datasets.Dataset.from_dict(columns)
        for split, columns in parsed.items()
    })

# Emotion detection

In [31]:
# Build the emotion DatasetDict with the "v0" training file, then count how
# often each label occurs in the training split.
data_path = Path("../data/BenchmarkingZeroShot/emotion")
dataset = build_dataset_emotion(data_path, train_fname="train_pu_half_v0.txt")
Counter(dataset["train"]["category"])

Counter({'fear': 8612,
         'shame': 696,
         'sadness': 6703,
         'anger': 2133,
         'love': 2320})

In [32]:
# Label distribution of the validation split (read from dev.txt by
# build_dataset_emotion).
Counter(dataset["validation"]["category"])

Counter({'fear': 1600,
         'joy': 1600,
         'sadness': 1300,
         'shame': 100,
         'guilt': 100,
         'disgust': 300,
         'anger': 800,
         'noemo': 1000,
         'surprise': 500,
         'love': 400})

In [49]:
# Write the current `dataset` to disk as the v0 emotion dataset.
# NOTE(review): this cell's recorded execution count (49) is higher than that
# of the later cells that rebind `dataset`; re-run the v0 build cell right
# before this one to be sure the v0 data is what gets saved.
dataset.save_to_disk("../data/emotion_v0")

In [29]:
# Rebuild the emotion DatasetDict with the "v1" training file (this rebinds
# `dataset`), then count the labels of the training split.
data_path = Path("../data/BenchmarkingZeroShot/emotion")
dataset = build_dataset_emotion(data_path, train_fname="train_pu_half_v1.txt")
Counter(dataset["train"]["category"])

Counter({'joy': 11166, 'guilt': 693, 'disgust': 848, 'surprise': 1496})

In [30]:
# Write the current `dataset` to disk as the v1 emotion dataset.
dataset.save_to_disk("../data/emotion_v1")

In [38]:
# Same as the v0 build, but every "noemo" label is renamed to "toneless" in
# all three splits. The recorded training counts match the un-renamed v0
# build, i.e. the rename does not show up in the training split.
data_path = Path("../data/BenchmarkingZeroShot/emotion")
dataset = build_dataset_emotion(data_path, train_fname="train_pu_half_v0.txt", rename_noemo="toneless")
Counter(dataset["train"]["category"])

Counter({'fear': 8612,
         'shame': 696,
         'sadness': 6703,
         'anger': 2133,
         'love': 2320})

In [39]:
# Write the current `dataset` to disk as the v0 variant with "noemo" renamed.
dataset.save_to_disk("../data/emotion_v0_toneless")

In [40]:
# Same as the v1 build, but every "noemo" label is renamed to "toneless" in
# all three splits; then count the labels of the training split.
data_path = Path("../data/BenchmarkingZeroShot/emotion")
dataset = build_dataset_emotion(data_path, train_fname="train_pu_half_v1.txt", rename_noemo="toneless")
Counter(dataset["train"]["category"])

Counter({'joy': 11166, 'guilt': 693, 'disgust': 848, 'surprise': 1496})

In [41]:
# Write the current `dataset` to disk as the v1 variant with "noemo" renamed.
dataset.save_to_disk("../data/emotion_v1_toneless")

# Situation detection (single-label-only)

In the end, you should have *11* classes (TODO: verify this number).

In [19]:
def filter_out_multilabel_examples(classes, examples):
    """Drop every (class, example) pair whose class string contains a space.

    A space inside the class string is what marks a pair as multi-label here
    (presumably the labels are space-separated; verify against the data).

    Args:
        classes: Iterable of class strings.
        examples: Iterable of examples, paired with ``classes`` by position.

    Returns:
        A tuple ``(kept_classes, kept_examples)`` of two equally long lists,
        in the original order.
    """
    single_label_pairs = [
        (label, example)
        for label, example in zip(classes, examples)
        if " " not in label
    ]
    kept_classes = [label for label, _ in single_label_pairs]
    kept_examples = [example for _, example in single_label_pairs]
    return kept_classes, kept_examples

In [36]:
# Keep only the single-label training examples and count the remaining labels.
# NOTE(review): `dataset` must hold the situation data here, but no visible
# cell in this notebook loads it (the preceding cells bind `dataset` to the
# emotion data). The recorded output was apparently produced from hidden
# kernel state; add an explicit loading cell before this one.
new_c, new_e = filter_out_multilabel_examples(dataset["train"]["category"], dataset["train"]["text"])
new_subset = {"category": new_c, "text": new_e}
Counter(new_subset["category"])

Counter({'shelter': 399,
         'food': 601,
         'utils': 271,
         'terrorism': 338,
         'evac': 137})

In [37]:
# Wrap the filtered columns in a DatasetDict that has a "train" split only,
# and display it.
new_dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict(new_subset)
})
new_dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'text'],
        num_rows: 1746
    })
})

# Topic detection

In [13]:
# Load the topic class names, one per line; a name's position in the list is
# its integer id. This cell is for inspection: build_dataset_topic reads the
# same file again on its own.
with open("../data/BenchmarkingZeroShot/topic/classes.txt") as f:
    id2class = [x.strip('\n') for x in f]

id2class

['Society & Culture',
 'Science & Mathematics',
 'Health',
 'Education & Reference',
 'Computers & Internet',
 'Sports',
 'Business & Finance',
 'Entertainment & Music',
 'Family & Relationships',
 'Politics & Government']

In [17]:
def subset_from_csv_with_id2class(file_path, id2class):
    """Read a tab-separated ``class_id<TAB>text`` file into two parallel lists.

    Each non-blank line is split on its first tab only, so any further tabs
    remain part of the text. The integer class id is translated to its name
    through ``id2class`` and the line terminator is removed from the text.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.
        id2class: Sequence mapping an integer class id to its class name.

    Returns:
        A dict with the keys "category" (class names) and "text", each holding
        a list with one entry per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has no tab separator or its class id
            is not an integer.
        IndexError: If a class id is outside the range of ``id2class``.
    """
    subset = {"category": [], "text": []}

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Drop the line terminator so it does not end up inside the text.
            line = raw_line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. at the end of the file) carry no example.
                continue

            class_, sep, text = line.partition("\t")
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'class_id<TAB>text', got {line!r}"
                )

            subset["category"].append(id2class[int(class_)])
            subset["text"].append(text)

    return subset

In [18]:
def build_dataset_topic(data_path, train_fname="train_pu_half_v0.txt"):
    """Assemble the topic DatasetDict from the files under ``data_path``.

    Class names are read from ``classes.txt`` (one per line; the line index is
    the class id). The "train" split is read from ``train_fname``,
    "validation" from ``dev.txt`` and "test" from ``test.txt``; each file is
    parsed with ``subset_from_csv_with_id2class``.

    Args:
        data_path: Directory containing ``classes.txt`` and the split files.
        train_fname: File name of the training split inside ``data_path``.
            Defaults to the previously hard-coded "train_pu_half_v0.txt", so
            existing calls behave as before.

    Returns:
        A ``datasets.DatasetDict`` with "train", "validation" and "test".
    """
    data_path = Path(data_path)

    # Explicit encoding so class names do not depend on the platform default.
    with open(data_path/"classes.txt", encoding="utf-8") as f:
        id2class = [x.strip('\n') for x in f]

    train_set = subset_from_csv_with_id2class(data_path/train_fname, id2class)
    valid_set = subset_from_csv_with_id2class(data_path/"dev.txt", id2class)
    test_set = subset_from_csv_with_id2class(data_path/"test.txt", id2class)

    dataset = datasets.DatasetDict({
        "train": datasets.Dataset.from_dict(train_set),
        "validation": datasets.Dataset.from_dict(valid_set),
        "test": datasets.Dataset.from_dict(test_set),
    })

    return dataset

In [19]:
# Build the topic DatasetDict; this rebinds `dataset`.
dataset = build_dataset_topic("../data/BenchmarkingZeroShot/topic")

In [22]:
# Distinct topic labels present in the training split. The recorded output
# lists 5 of the 10 class names found in classes.txt.
set(dataset["train"]["category"])

{'Business & Finance',
 'Computers & Internet',
 'Family & Relationships',
 'Health',
 'Society & Culture'}

In [23]:
# Write the topic DatasetDict to disk.
dataset.save_to_disk("../data/topic_v0")