In [1]:
import numpy as np 
import pandas as pd
from pathlib import Path
from scipy.stats import ttest_ind
from sklearn.metrics import f1_score, jaccard_score
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.ensemble import RakelD
from bow import BOW

# Data

source: https://github.com/NLeSC/spudisc-emotion-classification

Preprocessed versions of the data, split for training and testing a classifier, can be found in the files train.txt and test.txt. These contain one sentence per line with labels at the end of each line. A single space separates the labels from the text. Multiple labels are separated by underscores. Where a sentence received no label, the string None appears. (No label means no emotions assigned by the annotator; all sentences have been annotated.)

ToDo: 
- cite and description of data

In [2]:
DATA_FILES = Path('./data/')
TEST_DATA = DATA_FILES / 'test.txt'
TRAIN_DATA = DATA_FILES / 'train.txt'

train_sentences, train_labels = BOW().load_data_raw(TRAIN_DATA)
test_sentences, test_labels = BOW().load_data_raw(TEST_DATA)

In [3]:
print(train_sentences.shape)
print(train_labels.shape)
print(test_sentences.shape)
print(test_labels.shape)

(503,)
(503,)
(126,)
(126,)


# Features

Both algorithms use standard bag-of-words features with stop word removal and optional tf–idf weighting.

## BOW and Tf-idf

described at: https://medium.com/betacom/bow-tf-idf-in-python-for-unsupervised-learning-task-88f3b63ccd6d

scikit BOW: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer

scit tfid: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer

### Stop word removal

described at: https://aisb.org.uk/wp-content/uploads/2019/12/Final-vol-02.pdf#page=59

ToDo
- describe how the features are created

Difficultys:
- what does optional tf-idf mean? (for RAKEL tf-idf weighting with logaritimc tf)
- same features for OvR and RAKEL ?

In [4]:
bow = BOW(
    stop_words=True,
    tfidf=True,
    log=True,
)
X_train, X_test, y_train, y_test = bow.create()

In [5]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(503, 2168)
(503, 8)
(126, 2168)
(126, 8)


# Algorithms

## One-vs-Rest

Reduction to Binary classifyers

implementation of liblinear in sklearn is used which can be found in: https://scikit-learn.org/stable/modules/generated/
sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

wrapper: https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html#sklearn.multiclass.OneVsRestClassifier

description at: https://scikit-learn.org/stable/modules/svm.html (chapter 1.4.1.1.)

ToDo
- oversampling
- todo find best parameters per emotion and seperatly optimise models

In [6]:
ovr = OneVsRestClassifier(
    LinearSVC(penalty='l1', dual=False, max_iter=10e3))

## RAKEL

### Method

Implementation
- use skmultilearns RAKELd 
    - http://scikit.ml/api/skmultilearn.ensemble.rakeld.html
    - there the same paper is referenced as in our paper
    - what does the d stand for ? 
- uses linear SVMs as its base learnes
    - https://sklearn.org/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
    - fixed regularization parameter C=1
- no tuning applyed in paper due to time constraints
- use tf-idf weighting with logaritimic tf
- automatic oversampling
    - account for few occurances of some labels and label subsets
- use k=3 for label subsets -> undoing randomization as only 35 size-k subsets of label sets occure


ToDo:
- automatic oversampling

Difficultys:
- what implementation used for RAKEL ?
- what implementation used for base learner (linear SVM)
- not clear if L1 or L2 penalty (use L2 as it is default) ?
- how is the oversampling applied ?
    - what implementation / algorithm used ?
- what to set for base_classifier_require_dense ? 

In [7]:
rakel = RakelD(
    base_classifier=LinearSVC(C=1),
    base_classifier_require_dense=[True, True],
    labelset_size=3
)

# Benchmark

## Method:

Empirically evaluate the algorithms on the dataset
- report scores per class and averaged over all classes
    - F1 scores
    - accuracy
        - overall accuracy: one minus the Hamming loss
- all scores averaged over ten runs of each training algorithm to account for randomization

## Info:
- RAKEL results of all runs achive the exact same scores despite randomization
- RAKEL does not learn to predict "Anger" 
- OvR archives bad score for four runs and zero for remanaing runs for label "Anger"

ToDo
- set seed
- randomize
- parameters of OvR is not set in the beginning of experiments but determined during run
    - does this happen once or for each run ?
- time is also measured but not reported
    - we should capture this

Difficulties
- no seed 
- how to randomize the runs?
    - same seed for both algorithms?
    - how to vary seed ?
- what is used to calculate the overall scores ?
- are OvR hyperparameters optimized once or for every run ? 
- stated that RAKEL faster but no times reported

In [8]:
def benchmark(bow, ovr, rakel, num_runs=10):
    results_all_runs = []

    X_train, X_test, y_train, y_test = bow.create()
    labels = bow.get_labels()

    for n in range(num_runs):
        # run OvR model
        ovr.fit(X_train, y_train)
        ovr_prediction = ovr.predict(X_test)
        ovr_f1 = f1_score(y_test, ovr_prediction, average=None)
        ovr_accuracy = jaccard_score(y_test, ovr_prediction, average=None)

        # run RAKEL model
        rakel.fit(X_train, y_train)
        rakel_prediction = rakel.predict(X_test)
        rakel_f1 = f1_score(y_test, rakel_prediction, average=None)
        rakel_accuracy = jaccard_score(y_test, rakel_prediction, average=None)

        # capture results in DataFrame
        results = pd.DataFrame({
            'Emotion': labels,
            'run': n,
            'ovr_f1': ovr_f1,
            'ovr_accuracy': ovr_accuracy,
            'rakel_f1': rakel_f1,
            'rakel_accuracy': rakel_accuracy,
        })
        results_all_runs.append(results)

    return pd.concat(results_all_runs), labels

In [9]:
results, labels = benchmark(bow, ovr, rakel)

In [10]:
results.shape

(80, 6)

In [11]:
results.head(5)

Unnamed: 0,Emotion,run,ovr_f1,ovr_accuracy,rakel_f1,rakel_accuracy
0,Anger,0,0.0,0.0,0.0,0.0
1,Fear,0,0.285714,0.166667,0.166667,0.090909
2,Interest,0,0.071429,0.037037,0.0,0.0
3,Joy,0,0.5,0.333333,0.2,0.111111
4,Love,0,0.5,0.333333,0.534653,0.364865


In [12]:
labels

array(['Anger', 'Fear', 'Interest', 'Joy', 'Love', 'None', 'Sadness',
       'Surprise'], dtype=object)

# Results

## Method:


ToDo
- compute overall results per emotion for all runs
    - overall accuracy ?
- compute variances

In [13]:
def calulate_results(scores, labels):
    grouped = scores.groupby('Emotion').agg(
        ovr_f1_mean=('ovr_f1', 'mean'),
        ovr_f1_var=('ovr_f1', 'var'),
        ovr_accuracy_mean=('ovr_accuracy', 'mean'),
        ovr_accuracy_var=('ovr_accuracy', 'var'),
        rakel_f1_mean=('rakel_f1', 'mean'),
        rakel_f1_var=('rakel_f1', 'var'),
        rakel_accuracy_mean=('rakel_accuracy', 'mean'),
        rakel_accuracy_var=('rakel_accuracy', 'var'),
    )
    return grouped.round(3)

In [14]:
calulate_results(results, labels)

Unnamed: 0_level_0,ovr_f1_mean,ovr_f1_var,ovr_accuracy_mean,ovr_accuracy_var,rakel_f1_mean,rakel_f1_var,rakel_accuracy_mean,rakel_accuracy_var
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Anger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fear,0.285714,0.0,0.166667,0.0,0.1,0.007407,0.054545,0.002204
Interest,0.071182,6.066636e-07,0.036905,1.749671e-07,0.033391,0.001864,0.017424,0.000508
Joy,0.523077,0.002366864,0.355556,0.002194787,0.314639,0.008368,0.189829,0.004141
Love,0.5,0.0,0.333333,0.0,0.559356,0.001384,0.389109,0.001309
,0.357143,0.0,0.217391,0.0,0.504291,0.001414,0.337914,0.001114
Sadness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Surprise,0.285714,0.0,0.166667,0.0,0.142592,0.007865,0.079042,0.002807


## Test significance between Algorithms

### Method:

Welch's one-sided t-test
- implementation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
- parameters:
    - equal_var=False
        - If False, perform Welch’s t-test, which does not assume equal population variance.
    - alternative='greater'
        - ‘greater’: one-sided


Todo
- compute Welch’s one-sided t-test
    - test significance for alpha = (0.05, 0.001)
- optional: compute other test statistics

In [18]:
def calculate_significance(scores, labels):
    results = []
    for label in labels:
        result = {}
        
        # calculate p-values for F1
        ovr_f1 = scores[scores['Emotion'] == label]['ovr_f1'].values
        rakel_f1 = scores[scores['Emotion'] == label]['rakel_f1'].values
        welch_f1 = ttest_ind(ovr_f1, rakel_f1, equal_var=False, alternative='greater')
        result['f1'] = welch_f1.pvalue

        # calculate p-calues for Accuracy
        ovr_accuracy = scores[scores['Emotion'] == label]['ovr_accuracy'].values
        rakel_accuracy = scores[scores['Emotion'] == label]['rakel_accuracy'].values
        welch_accuracy = ttest_ind(ovr_accuracy, rakel_accuracy, equal_var=False, alternative='greater')
        result['accuracy'] = welch_accuracy.pvalue

        results.append(result)
    return pd.DataFrame(results, index=labels).round(3)

In [19]:
calculate_significance(results, labels)

Unnamed: 0,f1,accuracy
Anger,,
Fear,0.0,0.0
Interest,0.011,0.012
Joy,0.0,0.0
Love,1.0,1.0
,1.0,1.0
Sadness,,
Surprise,0.0,0.0


In [17]:
# ToDO something runs wrong here the p-values for accuracy and f1 are exactly the same and anger and sadness have NaNs