In [1]:
import os
os.chdir('../../../')

In [2]:
from typing import Tuple

import spacy
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from preprocessing.BOWPipeline import BOWPipeline, basic_tokenizing_and_cleaning
from benchmark_subtask_2 import LABELS

# Test different class balancing algorithms

## Let's test it on the largest dataset and its largest version (raw_text)

### Prepare and split the data

In [4]:
# Load example dataset
DATA_DIR = os.path.join('data', 'preprocessed')
en_train_df = pd.read_csv(os.path.join(DATA_DIR, 'input_en_train.csv'), index_col='id')

# Create objects that will preprocess the data
en_nlp = spacy.load('en_core_web_sm')
preproc_pipeline = BOWPipeline(
    tokenizer=lambda string: basic_tokenizing_and_cleaning(string, spacy_nlp_model=en_nlp),
)

mlb = MultiLabelBinarizer()
mlb.fit([LABELS])

In [5]:
X = preproc_pipeline.pipeline.fit_transform(en_train_df.raw_text)
y = mlb.transform(en_train_df.frames.str.lower().str.split(','))

In [6]:
stratified_kfold_splitter = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=123)
train_idx, test_idx = next(stratified_kfold_splitter.split(X, y))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

## Create a class to preprocess and calculate basic metrics of a multilabel dataset

In [107]:
def calculate_scrumble(label_mask: np.ndarray, imb_rations: np.ndarray):
    instance_level_ir = label_mask * imb_rations
    scrumble_score_lbls = 1 - (1/instance_level_ir.mean(axis=1, where=label_mask)) * np.power(instance_level_ir.prod(axis=1, where=label_mask), 1/label_mask.sum(axis=1))
    scrumble_score = scrumble_score_lbls.mean()

    return instance_level_ir, scrumble_score_lbls, scrumble_score

class MultiLabelDataset:
    def __init__(self, x, y, labels: Tuple[str, ...], binarized=False):
        self.features = x
        self.target = y
        self.labels = labels
        self.label2idx = {label: idx for idx, label in enumerate(labels)}
        if binarized:
            self.binarized_target = y
        else:
            self._mlb = MultiLabelBinarizer()
            self._mlb.fit([labels])
            self.binarized_target = self._mlb.fit_transform(y)

        self.characteristics, self.majority_labels, self.minority_labels =\
            self._calculate_multilabel_metrics()

    def _calculate_multilabel_metrics(self):
        rel_freq = {
            label: rel_freq for label, rel_freq in
            zip(self.labels, self.binarized_target.sum(axis=0) / self.binarized_target.shape[0])
        }

        rel_freq_array = np.array(list(rel_freq.values()))
        max_freq = rel_freq_array.max()
        imb_ratio = {label: max_freq/rel_freq for label, rel_freq in rel_freq.items()}
        imb_ratio_array = np.array(list(imb_ratio.values()))
        mean_imb_ratio = imb_ratio_array.mean()
        cv_ir = imb_ratio_array.std() / mean_imb_ratio

        instance_level_ir, scrumble_score_lbls, scrumble_score = \
        calculate_scrumble(label_mask=self.binarized_target.astype(bool),
                               imb_rations=np.array(list(imb_ratio.values())))

        characteristics = {
            'relative_frequencies': rel_freq, 'imbalanced_ratios': imb_ratio,
            'mean_imbalanced_ratio': mean_imb_ratio, 'IR_coef_of_variation': cv_ir,
            'instance_level_ir': instance_level_ir, 'scrumble_score_lbls': scrumble_score_lbls,
            'scrumble_score': scrumble_score
        }
        majority_labels = [label for label, imb_ratio in imb_ratio.items() if imb_ratio <= mean_imb_ratio]
        minority_labels = [label for label, imb_ratio in imb_ratio.items() if imb_ratio > mean_imb_ratio]


        return characteristics, majority_labels, minority_labels

    def print_report(self):
        print(f'Majority labels: {self.majority_labels}')
        print(f'Minority labels: {self.minority_labels}')
        print(f'mean_imbalanced_ratio: {self.characteristics["mean_imbalanced_ratio"]}')
        print(f'scrumble_score: {self.characteristics["scrumble_score"]}')
        print(f'IR_coef_of_variation: {self.characteristics["IR_coef_of_variation"]}')
        print(f'imbalanced_ratios" {self.characteristics["imbalanced_ratios"]}')


In [108]:
mlb_dataset = MultiLabelDataset(x=X, y=en_train_df.frames.str.lower().str.split(','), labels=LABELS)

In [109]:
mlb_dataset.print_report()

Majority labels: ['security_and_defense', 'policy_prescription_and_evaluation', 'capacity_and_resources', 'economic', 'cultural_identity', 'health_and_safety', 'quality_of_life', 'legality_constitutionality_and_jurisprudence', 'public_opinion', 'external_regulation_and_reputation']
Minority labels: ['fairness_and_equality', 'crime_and_punishment', 'morality', 'political']
mean_imbalanced_ratio: 3.8333414890009467
scrumble_score: 0.1054402803404845
IR_coef_of_variation: 0.813248130779685
imbalanced_ratios" {'fairness_and_equality': 8.03448275862069, 'security_and_defense': 1.0309734513274338, 'crime_and_punishment': 7.516129032258066, 'morality': 8.321428571428573, 'policy_prescription_and_evaluation': 1.9416666666666669, 'capacity_and_resources': 2.043859649122807, 'economic': 3.819672131147541, 'cultural_identity': 1.1592039800995027, 'health_and_safety': 1.1534653465346536, 'quality_of_life': 3.5846153846153848, 'legality_constitutionality_and_jurisprudence': 1.0, 'political': 10.130

In [28]:
mlb_dataset.majority_labels

['security_and_defense',
 'policy_prescription_and_evaluation',
 'capacity_and_resources',
 'economic',
 'cultural_identity',
 'health_and_safety',
 'quality_of_life',
 'legality_constitutionality_and_jurisprudence',
 'public_opinion',
 'external_regulation_and_reputation']

In [29]:
mlb_dataset.minority_labels

['fairness_and_equality', 'crime_and_punishment', 'morality', 'political']

In [30]:
mlb_dataset.characteristics['mean_imbalanced_ratio']

3.8333414890009467

In [31]:
mlb_dataset.characteristics['scrumble_score']

0.1054402803404845

In [32]:
mlb_dataset.characteristics['IR_coef_of_variation']

0.813248130779685

It does not seem to suffer from a lot of coupling between majority and minority classes, as measured by scrumble score higher than 0.1. It seems we would not have to artificially split samples into two, particularly when we are treating each label as independent

### Test oversampling techniques (particularly time-wise)

#### Test the heuristic of continually upsampling in decreasing order of class frequency

In [7]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

In [24]:
order_series = pd.Series(mlb_dataset.characteristics['imbalanced_ratios']).sort_values(ascending=False)

In [39]:
ros = RandomOverSampler(random_state=42)
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

In [89]:
X_res = mlb_dataset.features.copy()
y_res = mlb_dataset.binarized_target.copy()
y_res_idxs = np.arange(0, y_res.shape[0]).reshape((y_res.shape[0], 1))

In [90]:
for label, imb_ratio in pd.Series(mlb_dataset.characteristics['imbalanced_ratios']).sort_values(ascending=False).iteritems():
    label_idx = mlb_dataset.label2idx[label]
    y_res_idxs, _ = ros.fit_resample(y_res_idxs, y_res[:, label_idx])

    # prepare for next iteration
    y_res = y_res[y_res_idxs[:, 0]]
    y_res_idxs = np.arange(0, y_res.shape[0]).reshape((y_res.shape[0], 1))

In [110]:
mlb_dataset_resampled = MultiLabelDataset(x=y_res, y=y_res, labels=LABELS, binarized=True)

In [111]:
mlb_dataset_resampled.print_report()

Majority labels: ['security_and_defense', 'crime_and_punishment', 'policy_prescription_and_evaluation', 'capacity_and_resources', 'cultural_identity', 'health_and_safety', 'legality_constitutionality_and_jurisprudence', 'political', 'external_regulation_and_reputation']
Minority labels: ['fairness_and_equality', 'morality', 'economic', 'quality_of_life', 'public_opinion']
mean_imbalanced_ratio: 1.79208517668352
scrumble_score: 0.03861619207567673
IR_coef_of_variation: 0.4316613250227376
imbalanced_ratios" {'fairness_and_equality': 3.3755034406572926, 'security_and_defense': 1.232450464682453, 'crime_and_punishment': 1.453852482578829, 'morality': 2.813847747678613, 'policy_prescription_and_evaluation': 1.4113237685594142, 'capacity_and_resources': 1.2305685136087527, 'economic': 3.3176276323825467, 'cultural_identity': 1.0, 'health_and_safety': 1.3097551392188032, 'quality_of_life': 2.0423170646800806, 'legality_constitutionality_and_jurisprudence': 1.2676335099349196, 'political': 1.4

In [112]:
mlb_dataset.print_report()

Majority labels: ['security_and_defense', 'policy_prescription_and_evaluation', 'capacity_and_resources', 'economic', 'cultural_identity', 'health_and_safety', 'quality_of_life', 'legality_constitutionality_and_jurisprudence', 'public_opinion', 'external_regulation_and_reputation']
Minority labels: ['fairness_and_equality', 'crime_and_punishment', 'morality', 'political']
mean_imbalanced_ratio: 3.8333414890009467
scrumble_score: 0.1054402803404845
IR_coef_of_variation: 0.813248130779685
imbalanced_ratios" {'fairness_and_equality': 8.03448275862069, 'security_and_defense': 1.0309734513274338, 'crime_and_punishment': 7.516129032258066, 'morality': 8.321428571428573, 'policy_prescription_and_evaluation': 1.9416666666666669, 'capacity_and_resources': 2.043859649122807, 'economic': 3.819672131147541, 'cultural_identity': 1.1592039800995027, 'health_and_safety': 1.1534653465346536, 'quality_of_life': 3.5846153846153848, 'legality_constitutionality_and_jurisprudence': 1.0, 'political': 10.130

The number of samples increased by a lot, but the average imbalanced ratio and the individual imbalanced ratios where lowered significantly (avg from 3.83 to 1.79) and Scrumble score from (0.1 to 0.038). Let's try multiple rounds of upsampling

In [114]:
for label, imb_ratio in pd.Series(mlb_dataset.characteristics['imbalanced_ratios']).sort_values(ascending=False).iteritems():
    label_idx = mlb_dataset.label2idx[label]
    y_res_idxs, _ = ros.fit_resample(y_res_idxs, y_res[:, label_idx])

    # prepare for next iteration
    y_res = y_res[y_res_idxs[:, 0]]
    y_res_idxs = np.arange(0, y_res.shape[0]).reshape((y_res.shape[0], 1))

In [115]:
y_res_idxs.shape

(13472022, 1)

In [116]:
mlb_dataset_resampled2 = MultiLabelDataset(x=y_res, y=y_res, labels=LABELS, binarized=True)

In [120]:
mlb_dataset_resampled2.print_report()

Majority labels: ['security_and_defense', 'crime_and_punishment', 'policy_prescription_and_evaluation', 'capacity_and_resources', 'cultural_identity', 'health_and_safety', 'quality_of_life', 'legality_constitutionality_and_jurisprudence', 'external_regulation_and_reputation']
Minority labels: ['fairness_and_equality', 'morality', 'economic', 'political', 'public_opinion']
mean_imbalanced_ratio: 1.7508263353921591
scrumble_score: 0.03190887859238276
IR_coef_of_variation: 0.33611351359194136
imbalanced_ratios" {'fairness_and_equality': 2.9190524518532515, 'security_and_defense': 1.2711807066906269, 'crime_and_punishment': 1.6965074352038445, 'morality': 2.6220802747133654, 'policy_prescription_and_evaluation': 1.4820494415387344, 'capacity_and_resources': 1.1797287702798254, 'economic': 2.7077300231684447, 'cultural_identity': 1.0, 'health_and_safety': 1.2720886890284302, 'quality_of_life': 1.7038834784693122, 'legality_constitutionality_and_jurisprudence': 1.3253854543883612, 'political

In [122]:
mlb_dataset.characteristics['relative_frequencies']

{'fairness_and_equality': 0.06728538283062645,
 'security_and_defense': 0.5243619489559165,
 'crime_and_punishment': 0.07192575406032482,
 'morality': 0.06496519721577726,
 'policy_prescription_and_evaluation': 0.27842227378190254,
 'capacity_and_resources': 0.26450116009280744,
 'economic': 0.14153132250580047,
 'cultural_identity': 0.46635730858468677,
 'health_and_safety': 0.46867749419953597,
 'quality_of_life': 0.15081206496519722,
 'legality_constitutionality_and_jurisprudence': 0.5406032482598608,
 'political': 0.05336426914153132,
 'public_opinion': 0.20185614849187936,
 'external_regulation_and_reputation': 0.43155452436194897}

In [121]:
mlb_dataset_resampled.characteristics['relative_frequencies']

{'fairness_and_equality': 0.18776954789417732,
 'security_and_defense': 0.514273614340164,
 'crime_and_punishment': 0.4359567167662031,
 'morality': 0.2252491292360613,
 'policy_prescription_and_evaluation': 0.44909380050646913,
 'capacity_and_resources': 0.51506011080093,
 'economic': 0.19104517601099366,
 'cultural_identity': 0.6338167549674598,
 'health_and_safety': 0.4839200366455494,
 'quality_of_life': 0.31034199631815945,
 'legality_constitutionality_and_jurisprudence': 0.5,
 'political': 0.42940546053257045,
 'public_opinion': 0.3312878663474586,
 'external_regulation_and_reputation': 0.5090533521170584}

In [119]:
mlb_dataset_resampled2.characteristics['relative_frequencies']

{'fairness_and_equality': 0.22702323378034864,
 'security_and_defense': 0.5213206302661917,
 'crime_and_punishment': 0.39062176412716665,
 'morality': 0.2527354839533368,
 'policy_prescription_and_evaluation': 0.44714616707128296,
 'capacity_and_resources': 0.5617331236543408,
 'economic': 0.24474106411049507,
 'cultural_identity': 0.6626927271941806,
 'health_and_safety': 0.5209485257669562,
 'quality_of_life': 0.38893077817123517,
 'legality_constitutionality_and_jurisprudence': 0.5,
 'political': 0.3200318408031103,
 'public_opinion': 0.36458781020399167,
 'external_regulation_and_reputation': 0.45907978772599983}

Alright, so unless we decouple some of them, we will not get cleaner upsamples, even if we iterate further

#### Test REMEDIAL decoupling

It is just breaking instances into two, one that has only whatever minority samples are present and the other that only has the majority samples

#### Test MLSMOTE

#### Test MLSOL

### Do small comparison for all of them
