In [1]:
import numpy as np 
import pandas as pd
from pathlib import Path
from scipy.stats import ttest_ind
from sklearn.metrics import f1_score, jaccard_score
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.ensemble import RakelD
from bow import BOW

# Data

source: https://github.com/NLeSC/spudisc-emotion-classification

Preprocessed versions of the data, split for training and testing a classifier, can be found in the files train.txt and test.txt. These contain one sentence per line with labels at the end of each line. A single space separates the labels from the text. Multiple labels are separated by underscores. Where a sentence received no label, the string None appears. (No label means no emotions assigned by the annotator; all sentences have been annotated.)

ToDo: 
- cite and description of data

In [2]:
DATA_FILES = Path('./data/')
TEST_DATA = DATA_FILES / 'test.txt'
TRAIN_DATA = DATA_FILES / 'train.txt'

train_sentences, train_labels = BOW().load_data_raw(TRAIN_DATA)
test_sentences, test_labels = BOW().load_data_raw(TEST_DATA)

In [3]:
print(train_sentences.shape)
print(train_labels.shape)
print(test_sentences.shape)
print(test_labels.shape)

(503,)
(503,)
(126,)
(126,)


# Features

Both algorithms use standard bag-of-words features with stop word removal and optional tf–idf weighting.

## BOW and Tf-idf

described at: https://medium.com/betacom/bow-tf-idf-in-python-for-unsupervised-learning-task-88f3b63ccd6d

scikit BOW: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer

scit tfid: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer

### Stop word removal

described at: https://aisb.org.uk/wp-content/uploads/2019/12/Final-vol-02.pdf#page=59

ToDo
- describe how the features are created

In [4]:
bow = BOW(
    stop_words=True,
    tfidf=True,
    log=True,
)
X_train, X_test, y_train, y_test = bow.create()

In [5]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(503, 2168)
(503, 8)
(126, 2168)
(126, 8)


# Algorithms

## One-vs-Rest

Reduction to Binary classifyers

implementation of liblinear in sklearn is used which can be found in: https://scikit-learn.org/stable/modules/generated/
sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

wrapper: https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html#sklearn.multiclass.OneVsRestClassifier

description at: https://scikit-learn.org/stable/modules/svm.html (chapter 1.4.1.1.)

ToDo
- oversampling
- todo find best parameters per emotion and seperatly optimise models

In [6]:
ovr = OneVsRestClassifier(
    LinearSVC(penalty='l1', dual=False, max_iter=10e3))

## RAKEL

implementation found in: http://scikit.ml/api/skmultilearn.ensemble.rakeld.html

ToDo:
- automatic oversampling

Difficultys:
- not clear if L1 or L2 penalty (use L2 as it is default)

In [7]:
rakel = RakelD(
    base_classifier=LinearSVC(C=1),
    base_classifier_require_dense=[True, True],
    labelset_size=3
)

# Benchmark

ToDo
- set seed

Difficulties
- how many runs were performed to calculate variance

In [8]:
def benchmark(bow, ovr, rakel, num_runs=50):
    results_all_runs = []

    X_train, X_test, y_train, y_test = bow.create()
    labels = bow.get_labels()

    for n in range(num_runs):
        # run OvR model
        ovr.fit(X_train, y_train)
        ovr_prediction = ovr.predict(X_test)
        ovr_f1 = f1_score(y_test, ovr_prediction, average=None)
        ovr_accuracy = jaccard_score(y_test, ovr_prediction, average=None)

        # run RAKEL model
        rakel.fit(X_train, y_train)
        rakel_prediction = rakel.predict(X_test)
        rakel_f1 = f1_score(y_test, rakel_prediction, average=None)
        rakel_accuracy = jaccard_score(y_test, rakel_prediction, average=None)

        # capture results in DataFrame
        results = pd.DataFrame({
            'Emotion': labels,
            'run': n,
            'ovr_f1': ovr_f1,
            'ovr_accuracy': ovr_accuracy,
            'rakel_f1': rakel_f1,
            'rakel_accuracy': rakel_accuracy,
        })
        results_all_runs.append(results)

    return pd.concat(results_all_runs), labels

In [9]:
results, labels = benchmark(bow, ovr, rakel, 20)

In [10]:
results.shape

(160, 6)

In [11]:
results.head(5)

Unnamed: 0,Emotion,run,ovr_f1,ovr_accuracy,rakel_f1,rakel_accuracy
0,Anger,0,0.0,0.0,0.0,0.0
1,Fear,0,0.285714,0.166667,0.0,0.0
2,Interest,0,0.071429,0.037037,0.0,0.0
3,Joy,0,0.5,0.333333,0.266667,0.153846
4,Love,0,0.5,0.333333,0.58,0.408451


In [12]:
labels

array(['Anger', 'Fear', 'Interest', 'Joy', 'Love', 'None', 'Sadness',
       'Surprise'], dtype=object)

# Results

ToDo
- compute overall results per emotion for all runs
- compute variances

In [13]:
def calulate_results(scores, labels):
    grouped = scores.groupby('Emotion').agg(
        ovr_f1_mean=('ovr_f1', 'mean'),
        ovr_f1_var=('ovr_f1', 'var'),
        ovr_accuracy_mean=('ovr_accuracy', 'mean'),
        ovr_accuracy_var=('ovr_accuracy', 'var'),
        rakel_f1_mean=('rakel_f1', 'mean'),
        rakel_f1_var=('rakel_f1', 'var'),
        rakel_accuracy_mean=('rakel_accuracy', 'mean'),
        rakel_accuracy_var=('rakel_accuracy', 'var'),
    )
    return grouped

In [14]:
calulate_results(results, labels)

Unnamed: 0_level_0,ovr_f1_mean,ovr_f1_var,ovr_accuracy_mean,ovr_accuracy_var,rakel_f1_mean,rakel_f1_var,rakel_accuracy_mean,rakel_accuracy_var
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Anger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fear,0.285714,0.0,0.166667,0.0,0.080769,0.011871,0.045455,0.003915
Interest,0.070936,1e-06,0.036772,2.946814e-07,0.024103,0.002013,0.012712,0.000567
Joy,0.534615,0.002943,0.366667,0.002729045,0.346107,0.002725,0.210385,0.001391
Love,0.5,0.0,0.333333,0.0,0.588304,0.000716,0.417214,0.000705
,0.357143,0.0,0.217391,0.0,0.51643,0.001555,0.348987,0.001236
Sadness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Surprise,0.285714,0.0,0.166667,0.0,0.126567,0.008469,0.070109,0.002996


## Test significance between Algorithms

 use Welch's one-sided t-test

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

use parameter equal_var=False

If False, perform Welch’s t-test, which does not assume equal population variance.

use parameter alternative='greater' 

‘greater’: one-sided

Todo
- compute Welch’s one-sided t-test
    - test significance for alpha = (0.05, 0.001)
- optional: compute other test statistics

In [15]:
def calculate_significance(scores, labels):
    results = []
    for label in labels:
        result = {}
        
        # calculate p-values for F1
        ovr_f1 = scores[scores['Emotion'] == label]['ovr_f1'].values
        rakel_f1 = scores[scores['Emotion'] == label]['rakel_f1'].values
        welch_f1 = ttest_ind(ovr_f1, rakel_f1, equal_var=False, alternative='greater')
        result['f1'] = welch_f1.pvalue

        # calculate p-calues for Accuracy
        ovr_accuracy = scores[scores['Emotion'] == label]['ovr_accuracy'].values
        rakel_accuracy = scores[scores['Emotion'] == label]['rakel_accuracy'].values
        welch_accuracy = ttest_ind(ovr_f1, rakel_f1, equal_var=False, alternative='greater')
        result['accuracy'] = welch_accuracy.pvalue

        results.append(result)
    return pd.DataFrame(results, index=labels)

In [16]:
calculate_significance(results, labels)

Unnamed: 0,f1,accuracy
Anger,,
Fear,3.941101e-08,3.941101e-08
Interest,8.384537e-05,8.384537e-05
Joy,6.852655e-14,6.852655e-14
Love,1.0,1.0
,1.0,1.0
Sadness,,
Surprise,1.380361e-07,1.380361e-07


In [17]:
# ToDO something runs wrong here the p-values for accuracy and f1 are exactly the same and anger and sadness have NaNs