# Static split

Create a dataset with a standardized split.

In [1]:
import argparse
import logging
import pprint
import os
import sys
from itertools import chain
from collections import Counter

import torch
import torch.utils.data
import torch.nn.functional as F
import transformers
import datasets
import wandb

from tqdm import tqdm

import class_attention as cat

# Semantic Split

The test classes are pre-selected for this split based on their names

In [10]:
# Load the `Fraser/news-category-dataset` dataset through `datasets.load_dataset`
# and keep handles to the splits used below.
news_dataset = datasets.load_dataset("Fraser/news-category-dataset")
train_set = news_dataset["train"]
# NOTE(review): `test_set` is bound to the "validation" split, not "test" — confirm this is intended.
test_set = news_dataset["validation"]

Using custom data configuration default
Reusing dataset news_category (/home/vlialin/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c)


In [11]:
# Display the available splits with their columns and row counts.
news_dataset

DatasetDict({
    train: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 160682
    })
    test: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 30128
    })
    validation: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 10043
    })
})

In [4]:
# Tally the training examples per category and show them, most frequent first.
category_counts = Counter(train_set["category"])
category_counts.most_common()

[('POLITICS', 26127),
 ('WELLNESS', 14274),
 ('ENTERTAINMENT', 12904),
 ('TRAVEL', 7870),
 ('STYLE & BEAUTY', 7769),
 ('PARENTING', 6971),
 ('HEALTHY LIVING', 5343),
 ('QUEER VOICES', 5034),
 ('FOOD & DRINK', 4990),
 ('BUSINESS', 4744),
 ('COMEDY', 4107),
 ('SPORTS', 3933),
 ('BLACK VOICES', 3649),
 ('HOME & LIVING', 3322),
 ('PARENTS', 3183),
 ('WEDDINGS', 2936),
 ('THE WORLDPOST', 2868),
 ('WOMEN', 2785),
 ('IMPACT', 2769),
 ('DIVORCE', 2728),
 ('CRIME', 2720),
 ('MEDIA', 2266),
 ('WEIRD NEWS', 2124),
 ('RELIGION', 2080),
 ('GREEN', 2074),
 ('WORLDPOST', 2071),
 ('STYLE', 1835),
 ('SCIENCE', 1738),
 ('WORLD NEWS', 1734),
 ('TECH', 1697),
 ('TASTE', 1683),
 ('MONEY', 1366),
 ('ARTS', 1204),
 ('GOOD NEWS', 1131),
 ('FIFTY', 1114),
 ('ARTS & CULTURE', 1065),
 ('ENVIRONMENT', 1052),
 ('COLLEGE', 909),
 ('LATINO VOICES', 900),
 ('CULTURE & ARTS', 824),
 ('EDUCATION', 789)]

### A note on labels

The dataset we use (`Fraser/news-category-dataset`) has some interesting particularities in the class names.

For example, it has classes `STYLE` and `STYLE & BEAUTY`, or `WORLD NEWS` and `NEWS`, i.e., some class names contain other class names.
The classes that have `&` in their name have a similar particularity. Some of the categories do not seem to be distinguishable, e.g., `THE WORLDPOST` and `WORLDPOST`, or `ARTS & CULTURE` and `CULTURE & ARTS`.



Class names grouped by a substring they share:

* &	: STYLE & BEAUTY, ARTS & CULTURE, HOME & LIVING, FOOD & DRINK, CULTURE & ARTS
* VOICES	: LATINO VOICES, BLACK VOICES, QUEER VOICES
* NEWS	: WEIRD NEWS, GOOD NEWS, WORLD NEWS
* ARTS	: ARTS, ARTS & CULTURE, CULTURE & ARTS
* CULTURE	: ARTS & CULTURE, CULTURE & ARTS
* LIVING	: HEALTHY LIVING, HOME & LIVING
* WORLDPOST	: THE WORLDPOST, WORLDPOST
* WORLD	: THE WORLDPOST, WORLDPOST

In [12]:
# Classes hand-picked as test classes for the semantic split.
# Each comment names the similarly named classes that are NOT in this list.
test_classes = [
    "LATINO VOICES",  # related to BLACK VOICES, QUEER VOICES
    "PARENTS",        # related to PARENTING
    "DIVORCE",        # related to WEDDINGS
    "WORLDPOST",      # related to THE WORLDPOST
    "STYLE",          # related to STYLE & BEAUTY
    "WORLD NEWS",     # related to WEIRD NEWS, GOOD NEWS
    "CULTURE & ARTS", # related to ARTS & CULTURE
    "EDUCATION",      # related to COLLEGE, SCIENCE
]

In [13]:
# # TODO: UMAP?

# import umap

# embeddings = BERT(all_classes)

# umap_model = umap.UMAP()
# umap_emb = umap_model.fit_transform(embeddings)

In [14]:
# Split the training set by class. Per the verbose log, the `test_classes` are moved
# to a "class-test set"; the first return value is what later replaces the train split.
# NOTE(review): `_train_set_remainder` (presumably the examples of the moved classes) is
# never used or saved in the cells shown here — confirm that is intended.
reduced_train_set, _train_set_remainder = cat.utils.split_classes(
    train_set, test_classes=test_classes, verbose=True
)

Moving the following classes to a class-test set: ['LATINO VOICES', 'PARENTS', 'DIVORCE', 'WORLDPOST', 'STYLE', 'WORLD NEWS', 'CULTURE & ARTS', 'EDUCATION']


In [16]:
# Swap in the reduced train split; only the "train" entry is reassigned here,
# "test" and "validation" are not modified by this cell.
news_dataset["train"] = reduced_train_set
news_dataset

DatasetDict({
    train: Dataset({
        features: ['authors', 'category', 'category_num', 'date', 'headline', 'link', 'short_description'],
        num_rows: 146618
    })
    test: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 30128
    })
    validation: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 10043
    })
})

In [18]:
# Persist the full-size semantic split (the path is relative to the working directory).
news_dataset.save_to_disk("../data/news-category-semantic-split")

In [19]:
# Build a small (p = 0.1) subsample of every split for quick experiments and save it.
#
# The sampled splits go into a *new* DatasetDict instead of overwriting `news_dataset`:
# the original cell replaced the full-size splits in place, so re-running it sampled an
# already-sampled dataset and the full-size data was lost for any later cell.
#
# NOTE(review): `cat.utils.sample_dataset` is presumably random and no seed is set here,
# so the saved subset may differ between runs — confirm, and seed if this must be static.
p = 0.1  # passed to `sample_dataset` for every split
news_dataset_small = datasets.DatasetDict(news_dataset)  # shallow copy; keeps the split order
for split_name in ("train", "validation", "test"):
    news_dataset_small[split_name] = cat.utils.sample_dataset(news_dataset[split_name], p)

news_dataset_small.save_to_disk("../data/news-category-semantic-split-small")

# Random split

The test classes are randomly chosen

In [22]:
# Start again from the original dataset: earlier cells reassigned entries of `news_dataset`,
# so it is loaded afresh before building the random split.
news_dataset = datasets.load_dataset("Fraser/news-category-dataset")
train_set = news_dataset["train"]
# NOTE(review): `test_set` is bound to the "validation" split, not "test" — confirm this is intended.
test_set = news_dataset["validation"]

Using custom data configuration default
Reusing dataset news_category (/home/vlialin/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c)


In [23]:
# Split the training set by class, letting `split_classes` choose the test classes
# (`p_test_classes=0.2`), then replace the train split with the reduced set and display it.
# NOTE(review): no random seed is set in the cells shown here, so the chosen classes
# (see the verbose log) may change on re-run unless `split_classes` seeds internally —
# confirm, since this notebook is meant to produce a fixed split.
# NOTE(review): `_train_set_remainder` is not used or saved in the cells shown here.
reduced_train_set, _train_set_remainder = cat.utils.split_classes(
    train_set, p_test_classes=0.2, verbose=True,
)
news_dataset["train"] = reduced_train_set
news_dataset

Moving the following classes to a class-test set: ['COLLEGE', 'ARTS', 'POLITICS', 'RELIGION', 'WORLD NEWS', 'SPORTS', 'EDUCATION', 'TRAVEL']


DatasetDict({
    train: Dataset({
        features: ['authors', 'category', 'category_num', 'date', 'headline', 'link', 'short_description'],
        num_rows: 116036
    })
    test: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 30128
    })
    validation: Dataset({
        features: ['category_num', 'category', 'headline', 'authors', 'link', 'short_description', 'date'],
        num_rows: 10043
    })
})

In [24]:
# Persist the full-size random split (the path is relative to the working directory).
news_dataset.save_to_disk("../data/news-category-random-split")

In [25]:
# Build a small (p = 0.1) subsample of every split of the random split and save it.
#
# The sampled splits go into a *new* DatasetDict instead of overwriting `news_dataset`:
# the original cell replaced the full-size splits in place, so re-running it sampled an
# already-sampled dataset and the full-size data was lost for any later cell.
#
# NOTE(review): `cat.utils.sample_dataset` is presumably random and no seed is set here,
# so the saved subset may differ between runs — confirm, and seed if this must be static.
p = 0.1  # passed to `sample_dataset` for every split
news_dataset_small = datasets.DatasetDict(news_dataset)  # shallow copy; keeps the split order
for split_name in ("train", "validation", "test"):
    news_dataset_small[split_name] = cat.utils.sample_dataset(news_dataset[split_name], p)

news_dataset_small.save_to_disk("../data/news-category-random-split-small")

# Loading

In [28]:
# Read the small semantic split back from disk to check that the saved copy loads.
semantic_small = datasets.load_from_disk("../data/news-category-semantic-split-small")

In [29]:
# Display the reloaded splits with their columns and row counts.
semantic_small

DatasetDict({
    train: Dataset({
        features: ['authors', 'category', 'category_num', 'date', 'headline', 'link', 'short_description'],
        num_rows: 14661
    })
    test: Dataset({
        features: ['authors', 'category', 'category_num', 'date', 'headline', 'link', 'short_description'],
        num_rows: 3012
    })
    validation: Dataset({
        features: ['authors', 'category', 'category_num', 'date', 'headline', 'link', 'short_description'],
        num_rows: 1004
    })
})