In [1]:
import pandas as pd
import numpy as np
import ast

from scipy.stats import shapiro, wilcoxon, binom_test
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report, confusion_matrix, multilabel_confusion_matrix

import statsmodels.api as sm
from statsmodels.stats.contingency_tables import mcnemar
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import Counter

In [2]:
def extract_f1_scores_aspects(data_dict):
    """Collect the per-class F1 scores from an aspect classification report.

    Parameters
    ----------
    data_dict : dict
        Mapping from row name to a metrics mapping that contains an
        ``'f1-score'`` entry.

    Returns
    -------
    list
        The ``'f1-score'`` value of every row that is not one of the averaged
        rows ('micro avg', 'macro avg', 'weighted avg', 'samples avg'), in the
        iteration order of ``data_dict``.
    """
    averaged_rows = ('micro avg', 'macro avg', 'weighted avg', 'samples avg')
    return [
        row['f1-score']
        for name, row in data_dict.items()
        if name not in averaged_rows
    ]


def extract_f1_scores_sentiments(data_dict):
    """Collect the per-class F1 scores from a sentiment classification report.

    Parameters
    ----------
    data_dict : dict
        Mapping from row name to its metrics. Rows named 'accuracy',
        'macro avg' and 'weighted avg' are skipped without being indexed;
        every other row must contain an ``'f1-score'`` entry.

    Returns
    -------
    dict
        Row name -> ``'f1-score'`` for every row that was not skipped, in the
        iteration order of ``data_dict``.
    """
    summary_rows = {'accuracy', 'macro avg', 'weighted avg'}
    return {
        name: row['f1-score']
        for name, row in data_dict.items()
        if name not in summary_rows
    }

# SVM aspects

In [3]:
SVM_results_aspect = pd.read_csv("data/predictions/SVM_aspects_test.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Calling fit_transform a second time would relearn the class
# list from the predicted labels, so the columns of y_pred (and mlb.classes_,
# which is used as target_names) would no longer be guaranteed to line up
# with the columns of y_true.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(SVM_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(SVM_results_aspect['predictions'].apply(ast.literal_eval))

print(classification_report(y_true, y_pred, target_names=mlb.classes_, digits=4))
scores_SVM_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

f1_scores_SVM = extract_f1_scores_aspects(scores_SVM_aspects)
# A bare expression in the middle of a cell is not displayed, so print the
# Shapiro-Wilk result explicitly.
print(shapiro(f1_scores_SVM))
print(SVM_results_aspect.shape)

                       precision    recall  f1-score   support

            afspraken     1.0000    0.1739    0.2963        23
         communicatie     0.5000    0.2000    0.2857        80
              contact     0.5278    0.2754    0.3619        69
persoonlijke aandacht     0.6299    0.5774    0.6025       168
 roosters en planning     0.8000    0.2667    0.4000        45
              salaris     0.7667    0.3966    0.5227        58

            micro avg     0.6310    0.3860    0.4790       443
            macro avg     0.7041    0.3150    0.4115       443
         weighted avg     0.6449    0.3860    0.4609       443
          samples avg     0.3225    0.2958    0.2991       443

(479, 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SVM sentiments

In [4]:
SVM_results_sent = pd.read_csv("data/predictions/SVM_sentiments_test.csv", sep=';')

# The gold and predicted sentiment columns are passed to classification_report
# as-is; the MultiLabelBinarizer that used to be created here was never used.
y_true = SVM_results_sent['sentiment']
y_pred = SVM_results_sent['predicted_sentiment']

print(classification_report(y_true, y_pred, digits=4))
output_dict = classification_report(y_true, y_pred, output_dict=True)
# Keep only the per-class F1 values (summary rows are dropped by the helper).
SVM_scores_sentiments = extract_f1_scores_sentiments(output_dict)


              precision    recall  f1-score   support

    negatief     0.8937    0.9318    0.9124       352
    positief     0.6842    0.5714    0.6228        91

    accuracy                         0.8578       443
   macro avg     0.7890    0.7516    0.7676       443
weighted avg     0.8507    0.8578    0.8529       443



# MLP aspects

In [5]:
MLP_results_aspect = pd.read_csv("data/predictions/MLP_aspects_withoutDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(MLP_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(MLP_results_aspect['predictions'].apply(ast.literal_eval))

scores_MLP_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_MLP_asp = extract_f1_scores_aspects(scores_MLP_aspects)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# MLP sentiments

In [6]:
# MLP sentiment predictions: score them against the gold 'sentiment' column
# and keep the per-class F1 values.
MLP_results_sentiment = pd.read_csv("data/predictions/MLP_sentiments_withoutDA.csv", sep=';')
y_true, y_pred = MLP_results_sentiment['sentiment'], MLP_results_sentiment['predictions']

scores_MLP_sent = classification_report(y_true, y_pred, output_dict=True)
f1_scores_MLP_sent = extract_f1_scores_sentiments(scores_MLP_sent)

# BERTje zero aspects

In [7]:
BERTjezero_results_aspect = pd.read_csv("data/predictions/BERTje_zeroshot_aspects_withoutDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(BERTjezero_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(BERTjezero_results_aspect['predicted_aspects'].apply(ast.literal_eval))

scores_BERTjezero_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_BERTjezero_asp = extract_f1_scores_aspects(scores_BERTjezero_aspects)

# NOTE(review): this Counter is not the last expression of the cell, so its
# value is never displayed.
Counter(BERTjezero_results_aspect['predicted_aspects'])

# NOTE(review): hard-coded ratio; presumably 381 was read off the Counter above
# and 443 is a total count -- confirm and derive both from the data instead.
381/443

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8600451467268623

# BERTje zero sentiments

In [8]:
# BERTje zero-shot sentiment predictions: score them against the gold
# 'sentiment' column and keep the per-class F1 values.
BERTjezero_results_sentiment = pd.read_csv("data/predictions/BERTje_zeroshot_sentiments_withoutDA.csv", sep=';')
y_true, y_pred = BERTjezero_results_sentiment['sentiment'], BERTjezero_results_sentiment['predicted_sentiment']

scores_BERTjezero_sent = classification_report(y_true, y_pred, output_dict=True)
f1_scores_BERTjezero_sent = extract_f1_scores_sentiments(scores_BERTjezero_sent)

# RobBERT zero aspects

In [9]:
RobBERTzero_results_aspect = pd.read_csv("data/predictions/RobBERT_zeroshot_aspects_withoutDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(RobBERTzero_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(RobBERTzero_results_aspect['predicted_aspects'].apply(ast.literal_eval))

scores_RobBERTzero_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_RobBERTzero_asp = extract_f1_scores_aspects(scores_RobBERTzero_aspects)

  _warn_prf(average, modifier, msg_start, len(result))


# RobBERT zero sentiments

In [10]:
# RobBERT zero-shot sentiment predictions: score them against the gold
# 'sentiment' column and keep the per-class F1 values.
RobBERTzero_results_sentiment = pd.read_csv("data/predictions/RobBERT_zeroshot_sentiments_withoutDA.csv", sep=';')
y_true, y_pred = RobBERTzero_results_sentiment['sentiment'], RobBERTzero_results_sentiment['predicted_sentiment']

scores_RobBERTzero_sent = classification_report(y_true, y_pred, output_dict=True)
f1_scores_RobBERTzero_sent = extract_f1_scores_sentiments(scores_RobBERTzero_sent)

# BERTje few aspects

In [11]:
BERTjefew_results_aspect = pd.read_csv("data/predictions/BERTje_fewshot_aspects_test_withoutDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(BERTjefew_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(BERTjefew_results_aspect['decoded_predictions'].apply(ast.literal_eval))

scores_BERTjefew_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_BERTjefew_asp = extract_f1_scores_aspects(scores_BERTjefew_aspects)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BERTje few sentiments

In [12]:
# BERTje few-shot sentiment predictions: score the 'prediction' column against
# the 'targets' column and keep the per-class F1 values.
BERTjefew_results_sentiment = pd.read_csv("data/predictions/BERTje_fewshot_sentiments_withoutDA.csv", sep=';')
y_true, y_pred = BERTjefew_results_sentiment['targets'], BERTjefew_results_sentiment['prediction']

scores_BERTjefew_sent = classification_report(y_true, y_pred, output_dict=True)
f1_scores_BERTjefew_sent = extract_f1_scores_sentiments(scores_BERTjefew_sent)

# RobBERT few aspects

In [13]:
RobBERTfew_results_aspect = pd.read_csv("data/predictions/RobBERT_fewshot_aspects_withoutDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(RobBERTfew_results_aspect['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(RobBERTfew_results_aspect['decoded_predictions'].apply(ast.literal_eval))

scores_RobBERTfew_aspects = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_RobBERTfew_asp = extract_f1_scores_aspects(scores_RobBERTfew_aspects)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# RobBERT few sentiments

In [14]:
# RobBERT few-shot sentiment predictions: score the 'predictions' column
# against the 'targets' column and keep the per-class F1 values.
# NOTE(review): the later significance cells take the gold label for this same
# frame from 'sentiment' rather than 'targets' -- confirm the two agree.
RobBERTfew_results_sentiment = pd.read_csv("data/predictions/RobBERT_fewshot_sentiments_withoutDA.csv", sep=';')
y_true, y_pred = RobBERTfew_results_sentiment['targets'], RobBERTfew_results_sentiment['predictions']

scores_RobBERTfew_sent = classification_report(y_true, y_pred, output_dict=True)
f1_scores_RobBERTfew_sent = extract_f1_scores_sentiments(scores_RobBERTfew_sent)

# BERTje_few_aspects_DA

In [16]:
BERTjefew_results_aspectDA = pd.read_csv("data/predictions/BERTje_fewshot_aspects_test_withDA.csv", sep=';')

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(BERTjefew_results_aspectDA['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(BERTjefew_results_aspectDA['decoded_predictions'].apply(ast.literal_eval))

scores_BERTjefew_aspectsDA = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_BERTjefew_aspDA = extract_f1_scores_aspects(scores_BERTjefew_aspectsDA)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BERTje_few_sentiments_DA

In [17]:
# BERTje few-shot (with DA) sentiment predictions: score the 'prediction'
# column against the 'targets' column and keep the per-class F1 values.
BERTjefew_results_sentimentDA = pd.read_csv("data/predictions/BERTje_fewshot_sentiments_withDA.csv", sep=';')
y_true, y_pred = BERTjefew_results_sentimentDA['targets'], BERTjefew_results_sentimentDA['prediction']

scores_BERTjefew_sentDA = classification_report(y_true, y_pred, output_dict=True)
f1_scores_BERTjefew_sentDA = extract_f1_scores_sentiments(scores_BERTjefew_sentDA)

# RobBERT few aspects_DA

In [18]:
RobBERTfew_results_aspectDA = pd.read_csv("data/predictions/RobBERT_fewshot_aspects_withDA.csv", sep=';')
print(RobBERTfew_results_aspectDA.columns)

# Fit the binarizer on the gold annotations only and reuse it for the
# predictions. Refitting on the predictions would relearn the class list, so
# y_pred's columns and mlb.classes_ could stop matching y_true's columns.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(RobBERTfew_results_aspectDA['clean_annotation'].apply(ast.literal_eval))
y_pred = mlb.transform(RobBERTfew_results_aspectDA['decoded_predictions'].apply(ast.literal_eval))

# target_names is passed here as in the other aspect cells, so the report
# dict is keyed by class name instead of by column position.
scores_RobBERTfew_aspectDA = classification_report(y_true, y_pred, target_names=mlb.classes_, output_dict=True)

# Per-class F1 values only; the averaged rows are dropped by the helper.
f1_scores_RobBERTfew_aspectDA = extract_f1_scores_aspects(scores_RobBERTfew_aspectDA)

Index(['Unnamed: 0', 'text', 'common_annotation', 'clean_annotation', 'onehot',
       'predictions', 'decoded_predictions'],
      dtype='object')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# RobBERT few sentiments_DA

In [19]:
# RobBERT few-shot (with DA) sentiment predictions: score the 'predictions'
# column against 'encoded_polarity' and keep the per-class F1 values.
# NOTE(review): the RobBERT few-shot cell without DA scores against 'targets'
# instead of 'encoded_polarity' -- confirm which column holds the gold labels.
RobBERTfew_results_sentimentDA = pd.read_csv("data/predictions/RobBERT_fewshot_sentiments_withDA.csv", sep=';')
y_true, y_pred = RobBERTfew_results_sentimentDA['encoded_polarity'], RobBERTfew_results_sentimentDA['predictions']

scores_RobBERTfew_sentDA = classification_report(y_true, y_pred, output_dict=True)
f1_scores_RobBERTfew_sentDA = extract_f1_scores_sentiments(scores_RobBERTfew_sentDA)

In [20]:
# Gather the gold aspect labels and every model's predicted aspect labels in
# one frame. `df` is another name for RobBERTfew_results_aspect (no copy is
# made), so the columns added below are added to that frame as well.
df = RobBERTfew_results_aspect
df['gold_label'] = df['clean_annotation'].apply(ast.literal_eval)

# (new column, frame holding the predictions, column with the stringified labels)
# Column assignment aligns on the row index, so the frames are assumed to
# share the same index -- TODO confirm.
aspect_prediction_sources = [
    ('RobBERTfew_label', df, 'decoded_predictions'),
    ('BERTjefew_label', BERTjefew_results_aspect, 'decoded_predictions'),
    ('SVM_label', SVM_results_aspect, 'predictions'),
    ('MLP_label', MLP_results_aspect, 'predictions'),
    ('RobBERTzero_label', RobBERTzero_results_aspect, 'predicted_aspects'),
    ('BERTjezero_label', BERTjezero_results_aspect, 'predicted_aspects'),
    ('RobBERT_fewDA_label', RobBERTfew_results_aspectDA, 'decoded_predictions'),
    ('BERTje_fewDA_label', BERTjefew_results_aspectDA, 'decoded_predictions'),
]
for label_column, source_frame, source_column in aspect_prediction_sources:
    # Each cell holds the string form of a Python literal; parse it back.
    df[label_column] = source_frame[source_column].apply(ast.literal_eval)

In [21]:
columns_to_compare = ['RobBERTfew_label', 'SVM_label', 'BERTjefew_label', 'MLP_label', 'RobBERTzero_label',
                      'BERTjezero_label', 'RobBERT_fewDA_label', 'BERTje_fewDA_label']
# Not used in this cell; left defined so the name stays available to any
# other cell that refers to it.
convert_to_list = lambda x: list(x) if isinstance(x, tuple) else x

# Score every model per row: 1 when its predicted label set equals the gold
# label set, 0 otherwise. The result is stored in '<column>_binary'.
for column in columns_to_compare:
    new_column = column + '_binary'
    # Normalise the parsed predictions to lists (presumably some were parsed
    # as tuples).
    df[column] = df[column].apply(list)
    # Compare as sets: list equality is order-sensitive, so a correct
    # multi-label prediction listed in a different order than the gold
    # annotation would otherwise be counted as a mismatch.
    df[new_column] = [
        int(set(predicted) == set(gold))
        for predicted, gold in zip(df[column], df['gold_label'])
    ]

In [22]:
# Paired test on the per-row exact-match indicators (aspects):
# RobBERT few-shot vs RobBERT zero-shot.
# NOTE(review): the inputs are 0/1 indicators; the sentiment comparisons in
# this notebook use McNemar's test for the same kind of paired binary data --
# confirm that Wilcoxon is intended here.
wilcoxon(x=df['RobBERTfew_label_binary'], y=df['RobBERTzero_label_binary'])

WilcoxonResult(statistic=0.0, pvalue=3.8071669115004615e-59)

In [23]:
# Paired test on the per-row exact-match indicators (aspects):
# RobBERT few-shot vs SVM.
wilcoxon(x=df['RobBERTfew_label_binary'], y=df['SVM_label_binary'])

WilcoxonResult(statistic=3060.0, pvalue=5.220985924053902e-09)

In [24]:
# Paired test on the per-row exact-match indicators (aspects):
# RobBERT few-shot vs MLP.
wilcoxon(x=df['RobBERTfew_label_binary'], y=df['MLP_label_binary'])

WilcoxonResult(statistic=3818.0, pvalue=1.323195307423342e-08)

In [25]:
# Paired test on the per-row exact-match indicators (aspects):
# RobBERT few-shot vs BERTje few-shot.
wilcoxon(x=df['RobBERTfew_label_binary'], y=df['BERTjefew_label_binary'])

WilcoxonResult(statistic=2486.0, pvalue=0.023342202012890816)

In [26]:
# Paired test on the per-row exact-match indicators (aspects):
# BERTje few-shot vs BERTje zero-shot.
wilcoxon(x=df['BERTjefew_label_binary'], y=df['BERTjezero_label_binary'])

WilcoxonResult(statistic=120.5, pvalue=2.905893776313556e-53)

In [27]:
# Paired test on the per-row exact-match indicators (aspects):
# BERTje few-shot vs SVM.
wilcoxon(x=df['BERTjefew_label_binary'], y=df['SVM_label_binary'])

WilcoxonResult(statistic=3850.5, pvalue=8.88494191145669e-05)

In [28]:
# Paired test on the per-row exact-match indicators (aspects):
# BERTje few-shot vs MLP.
wilcoxon(x=df['BERTjefew_label_binary'], y=df['MLP_label_binary'])

WilcoxonResult(statistic=3750.0, pvalue=5.963854882444082e-05)

In [29]:
# Paired test on the per-row exact-match indicators (aspects):
# BERTje few-shot without DA vs with DA.
wilcoxon(x=df['BERTjefew_label_binary'], y=df['BERTje_fewDA_label_binary'])

WilcoxonResult(statistic=1085.0, pvalue=0.39939570366685395)

In [30]:
# Paired test on the per-row exact-match indicators (aspects):
# RobBERT few-shot without DA vs with DA.
wilcoxon(x=df['RobBERTfew_label_binary'], y=df['RobBERT_fewDA_label_binary'])

WilcoxonResult(statistic=887.5, pvalue=0.01682740948275685)

# sentiments

In [31]:
# Gather the gold sentiment and every model's predicted sentiment in one
# frame. `df_sent` is another name for RobBERTfew_results_sentiment (no copy
# is made), so the columns added below are added to that frame as well.
df_sent = RobBERTfew_results_sentiment
# NOTE(review): the gold label is taken from 'sentiment' here, while the F1
# cell for this frame scored against 'targets'; the recorded display of this
# frame contains a row where the two differ -- confirm which one is the gold.
df_sent['gold_label'] = df_sent['sentiment']

# (new column, frame holding the predictions, prediction column)
# Column assignment aligns on the row index, so the frames are assumed to
# share the same index -- TODO confirm.
sentiment_prediction_sources = [
    ('RobBERTfew_label', df_sent, 'predictions'),
    ('BERTjefew_label', BERTjefew_results_sentiment, 'prediction'),
    ('SVM_label', SVM_results_sent, 'predicted_sentiment'),
    ('MLP_label', MLP_results_sentiment, 'predictions'),
    ('RobBERTzero_label', RobBERTzero_results_sentiment, 'predicted_sentiment'),
    ('BERTjezero_label', BERTjezero_results_sentiment, 'predicted_sentiment'),
    ('RobBERT_fewDA_label', RobBERTfew_results_sentimentDA, 'predictions'),
    ('BERTje_fewDA_label', BERTjefew_results_sentimentDA, 'prediction'),
]
for label_column, source_frame, source_column in sentiment_prediction_sources:
    df_sent[label_column] = source_frame[source_column]

In [32]:
columns_to_compare = ['RobBERTfew_label', 'SVM_label', 'BERTjefew_label', 'MLP_label', 'RobBERTzero_label',
                      'BERTjezero_label', 'RobBERT_fewDA_label', 'BERTje_fewDA_label']

# Sentiment name -> integer code. Values that are not keys of this mapping
# (e.g. labels that are already 0/1) are left unchanged by `replace`.
sentiment_codes = {'negatief': 0, 'positief': 1}
df_sent['gold_label'] = df_sent['gold_label'].replace(sentiment_codes)

# For every model: encode its predictions as integers, then store a 0/1
# indicator of agreement with the gold label in '<column>_binary'.
for model_column in columns_to_compare:
    print(model_column)
    df_sent[model_column] = df_sent[model_column].replace(sentiment_codes).astype(int)
    agrees_with_gold = df_sent[model_column] == df_sent['gold_label']
    df_sent[model_column + '_binary'] = agrees_with_gold.astype(int)

RobBERTfew_label
SVM_label
BERTjefew_label
MLP_label
RobBERTzero_label
BERTjezero_label
RobBERT_fewDA_label
BERTje_fewDA_label


In [33]:
# Display the combined sentiment frame: gold labels, each model's encoded
# predictions and the matching '<model>_label_binary' agreement indicators.
df_sent

Unnamed: 0.1,Unnamed: 0,text,label,sentiment,common_annotation,clean_annotation,encoded_polarity,predictions,targets,gold_label,...,RobBERT_fewDA_label,BERTje_fewDA_label,RobBERTfew_label_binary,SVM_label_binary,BERTjefew_label_binary,MLP_label_binary,RobBERTzero_label_binary,BERTjezero_label_binary,RobBERT_fewDA_label_binary,BERTje_fewDA_label_binary
0,0,Betere verdeling van uren het verplicht stoppe...,roosters en planning,negatief,['roosters-planning_NEG'],['roosters en planning'],0,0,0,0,...,0,1,1,1,1,1,0,0,1,0
1,1,Zou fijn zijn als ik bezoek zou krijgen van We...,persoonlijke aandacht,negatief,['persoonlijke-aandacht_NEG'],['persoonlijke aandacht'],0,0,0,0,...,0,0,1,1,1,1,1,0,1,1
2,3,"Meedenken carrire, voorstel betere functies, s...",persoonlijke aandacht,negatief,"['persoonlijke-aandacht_NEG', 'salaris_NEG']","['persoonlijke aandacht', 'salaris']",0,0,0,0,...,0,0,1,1,1,1,0,0,1,1
3,4,"Meedenken carrire, voorstel betere functies, s...",salaris,negatief,"['persoonlijke-aandacht_NEG', 'salaris_NEG']","['persoonlijke aandacht', 'salaris']",0,1,0,0,...,0,0,0,1,1,1,0,0,1,1
4,5,Beetje meer waardering zou top zijn met het ke...,persoonlijke aandacht,negatief,['persoonlijke-aandacht_NEG'],['persoonlijke aandacht'],0,0,0,0,...,0,0,1,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,560,Ik heb het idee dat nog heel veel intercedente...,persoonlijke aandacht,negatief,['persoonlijke-aandacht_NEG'],['persoonlijke aandacht'],0,0,0,0,...,0,0,1,0,1,1,1,0,1,1
439,561,Er mag wel meer uitgelegd worden hoe het werkt...,communicatie,negatief,['communicatie_NEG'],['communicatie'],0,0,1,0,...,0,0,1,1,1,1,0,0,1,1
440,562,De menselijke maat is een belangrijke succesfa...,persoonlijke aandacht,positief,['persoonlijke-aandacht_POS'],['persoonlijke aandacht'],1,1,1,1,...,0,0,1,0,0,0,0,1,0,0
441,564,Misschien is het een idee om tijdens de onboar...,communicatie,negatief,['communicatie_NEG'],['communicatie'],0,0,0,0,...,0,0,1,1,1,1,1,0,1,1


In [34]:
def perform_mcnemar_test(series1, series2):
    """Run McNemar's test on two paired 0/1 series and print the outcome.

    Parameters
    ----------
    series1, series2
        Paired series of 0/1 values; rows are counted by comparing each
        series against 1 and against 0.

    Returns
    -------
    None
        The test statistic and p-value are printed, followed by a blank line.
    """
    first_is_one, first_is_zero = series1 == 1, series1 == 0
    second_is_one, second_is_zero = series2 == 1, series2 == 0

    # 2x2 table of joint outcomes; the off-diagonal cells hold the rows on
    # which the two series disagree.
    contingency_table = [
        [sum(first_is_one & second_is_one), sum(first_is_zero & second_is_one)],
        [sum(first_is_one & second_is_zero), sum(first_is_zero & second_is_zero)],
    ]

    outcome = mcnemar(contingency_table)

    print("McNemar's test statistic:", outcome.statistic)
    print("p-value:", outcome.pvalue)
    print()
# Pairwise McNemar comparisons between the sentiment models, using the
# per-row agreement indicators stored in df_sent. Each pair is
# (first series column, second series column).
sentiment_comparisons = [
    ('BERTjefew_label_binary', 'RobBERTfew_label_binary'),
    ('BERTjefew_label_binary', 'BERTjezero_label_binary'),
    ('RobBERTfew_label_binary', 'RobBERTzero_label_binary'),
    ('BERTjefew_label_binary', 'SVM_label_binary'),
    ('BERTjefew_label_binary', 'MLP_label_binary'),
    ('RobBERTfew_label_binary', 'SVM_label_binary'),
    ('RobBERTfew_label_binary', 'MLP_label_binary'),
    ('RobBERTfew_label_binary', 'RobBERT_fewDA_label_binary'),
    ('BERTjefew_label_binary', 'BERTje_fewDA_label_binary'),
]
for first_column, second_column in sentiment_comparisons:
    perform_mcnemar_test(df_sent[first_column], df_sent[second_column])


McNemar's test statistic: 69.0
p-value: 0.2916544513344335

McNemar's test statistic: 73.0
p-value: 1.1027112080165591e-22

McNemar's test statistic: 79.0
p-value: 5.4544656666487084e-06

McNemar's test statistic: 32.0
p-value: 7.793986201015429e-11

McNemar's test statistic: 29.0
p-value: 1.4292764071780264e-09

McNemar's test statistic: 27.0
p-value: 1.4874493889233367e-14

McNemar's test statistic: 27.0
p-value: 1.1331322341928529e-12

McNemar's test statistic: 58.0
p-value: 0.051523568581622346

McNemar's test statistic: 63.0
p-value: 0.6030463891288799

