In [3]:
import scipy.stats as stats
import numpy as np
from scipy.stats import wilcoxon

# Data preparation
# Position i of every score list corresponds to models[i]; the per-model
# loop at the bottom of this cell relies on that alignment.
ALPHA = 0.05  # significance threshold shared by every test in this cell

metrics = ['Precision', 'Recall', 'F1-score', 'Accuracy']
models = ["NB", "RF_BGE", "RF_GTE", "RF_UAE", "RF_MRL", "RF_ALI", "RF_MUL",
          "XGB_BGE", "XGB_GTE", "XGB_UAE", "XGB_MRL", "XGB_ALI", "XGB_MUL", "DistilBERT"]

two_model_data = {
    'Precision': [72.7, 75.0, 77.9, 78.1, 78.3, 78.6, 78.5, 75.5, 76.3, 77.2, 76.5, 78.6, 78.2, 76.8],
    'Recall': [67.2, 73.6, 76.3, 76.3, 76.3, 76.9, 77.2, 74.5, 74.9, 75.9, 74.9, 77.8, 77.5, 74.8],
    'F1-score': [67.4, 73.5, 76.3, 76.2, 76.3, 77.1, 77.4, 74.8, 75.0, 75.9, 75.1, 78.0, 77.7, 75.3],
    'Accuracy': [67.6, 73.4, 76.1, 76.1, 76.1, 76.8, 77.1, 74.4, 74.7, 75.8, 74.7, 77.8, 77.5, 75.4]
}

single_model_data = {
    'Precision': [64.1, 72.7, 71.4, 76.5, 65.0, 72.9, 72.7, 73.6, 70.1, 73.7, 68.5, 77.6, 76.8, 75.6],
    'Recall': [62.5, 70.3, 71.6, 74.3, 64.0, 72.7, 72.1, 74.0, 70.6, 72.7, 68.4, 77.6, 77.4, 75.1],
    'F1-score': [61.4, 68.5, 69.2, 72.6, 61.7, 71.8, 70.8, 72.6, 69.7, 72.3, 67.6, 77.3, 76.6, 75.2],
    'Accuracy': [61.8, 70.3, 71.7, 74.4, 64.5, 73.0, 72.7, 73.7, 71.3, 73.0, 68.9, 77.8, 77.1, 75.4]
}


def average_lines(label, avg_single, avg_two):
    """Format the three summary lines printed after each per-metric test.

    Args:
        label: Name of the metric being summarised (e.g. 'Precision').
        avg_single: Mean score of the single-model approach.
        avg_two: Mean score of the two-model approach.

    Returns:
        A list of three strings: the single-model average, the two-model
        average and their difference (two-model minus single-model), each
        rendered with two decimals.
    """
    return [
        # The original single-model line lacked the space after "Average".
        f"Average {label} (Single-model): {avg_single:.2f}",
        f"Average {label} (Two-model): {avg_two:.2f}",
        f"Difference: {avg_two - avg_single:.2f}",
    ]


# Perform Wilcoxon signed-rank test for each metric (paired across models)
for metric in metrics:
    two_scores = two_model_data[metric]
    single_scores = single_model_data[metric]

    statistic, p_value = wilcoxon(two_scores, single_scores)
    print(f"\nWilcoxon signed-rank test for {metric}:")
    print(f"Statistic: {statistic}")
    print(f"p-value: {p_value}")

    if p_value < ALPHA:
        print("There is a significant difference between the two-model and single-model approaches.")
    else:
        print("There is no significant difference between the two-model and single-model approaches.")

    # Calculate and print the number of improved models (strictly higher score)
    improvements = sum(t > s for t, s in zip(two_scores, single_scores))
    print(f"Number of models that improved in the two-model approach: {improvements} out of {len(models)}")

    # Calculate and print the average of the current metric for both approaches
    avg_single = np.mean(single_scores)
    avg_two = np.mean(two_scores)
    for line in average_lines(metric, avg_single, avg_two):
        print(line)


# Pairwise comparisons for each model
# NOTE(review): each test below pairs only len(metrics) == 4 values, and the
# four metrics of one model are not independent observations. With 4 pairs
# the exact two-sided p-value cannot fall below 2 / 2**4 = 0.125, so this
# loop cannot report significance at ALPHA = 0.05; read it as descriptive.
print("\nPairwise comparisons for each model:")
for i, model in enumerate(models):
    two_model_perf = [two_model_data[metric][i] for metric in metrics]
    single_model_perf = [single_model_data[metric][i] for metric in metrics]

    statistic, p_value = wilcoxon(two_model_perf, single_model_perf)
    print(f"\n{model}:")
    print(f"Wilcoxon statistic: {statistic}")
    print(f"p-value: {p_value}")

    if p_value < ALPHA:
        print("Significant difference between two-model and single-model approach.")
    else:
        print("No significant difference between two-model and single-model approach.")


Wilcoxon signed-rank test for Precision:
Statistic: 0.0
p-value: 0.0001220703125
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 14 out of 14
AveragePrecision (Single-model): 72.23
Average Precision (Two-model): 77.01
Difference: 4.79

Wilcoxon signed-rank test for Recall:
Statistic: 3.0
p-value: 0.0006103515625
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 13 out of 14
AverageRecall (Single-model): 71.66
Average Recall (Two-model): 75.29
Difference: 3.63

Wilcoxon signed-rank test for F1-score:
Statistic: 0.0
p-value: 0.0001220703125
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 14 out of 14
AverageF1-score (Single-model): 70.52
Average F1-score (Two-model): 75.43
Difference: 4.91

Wilcoxon signed-ra

In [1]:
import scipy.stats as stats
import numpy as np

# Data preparation
ALPHA = 0.05  # significance threshold shared by every test in this cell

humor_styles = ['Self-enhancing', 'Self-deprecating', 'Affiliative', 'Aggressive', 'Neutral']

single_model_data = {
    'Self-enhancing': [61.7, 80.3, 81.9, 82.8, 70.7, 76.9, 85.0, 80.3, 81.6, 80.0, 73.2, 82.6, 86.2, 79.4],
    'Self-deprecating': [66.0, 72.5, 76.7, 80.5, 65.9, 70.5, 66.7, 77.1, 67.4, 75.9, 71.3, 79.1, 77.6, 76.7],
    'Affiliative': [39.2, 40.5, 34.9, 46.5, 33.7, 54.5, 47.3, 50.0, 48.5, 57.4, 48.0, 64.9, 63.0, 60.2],
    'Aggressive': [56.4, 62.7, 69.1, 72.0, 58.9, 71.3, 67.6, 67.2, 65.6, 66.7, 62.8, 74.8, 67.7, 70.8],
    'Neutral': [83.6, 86.3, 83.4, 81.3, 79.5, 85.7, 87.1, 88.2, 85.3, 81.6, 82.6, 85.1, 88.7, 88.7]
}

two_model_data = {
    'Self-enhancing': [56.8, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 86.4, 80.3],
    'Self-deprecating': [66.7, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 80.9, 75.6],
    'Affiliative': [67.6, 50.5, 59.8, 59.0, 61.0, 66.7, 65.5, 57.1, 56.4, 58.2, 58.2, 66.1, 63.9, 61.2],
    'Aggressive': [64.0, 61.5, 65.7, 66.2, 64.8, 63.3, 65.7, 61.1, 62.9, 65.7, 61.4, 68.2, 68.8, 71.2],
    'Neutral': [81.7, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.4, 88.1]
}


def count_improvements(two_scores, single_scores):
    """Count the pairs in which the two-model score is strictly higher.

    Args:
        two_scores: Scores of the two-model approach.
        single_scores: Scores of the single-model approach, aligned
            position by position with ``two_scores``.

    Returns:
        The number of positions where ``two_scores[i] > single_scores[i]``;
        ties are not counted as improvements.
    """
    return sum(t > s for t, s in zip(two_scores, single_scores))


# Perform Wilcoxon signed-rank test for each humor style
for style in humor_styles:
    single_scores = single_model_data[style]
    two_scores = two_model_data[style]

    statistic, p_value = stats.wilcoxon(single_scores, two_scores)

    print(f"\nWilcoxon signed-rank test for {style} humor:")
    print(f"Statistic: {statistic}")
    print(f"p-value: {p_value:.4f}")

    if p_value < ALPHA:
        print("There is a significant difference between the two-model and single-model approaches.")
    else:
        print("There is no significant difference between the two-model and single-model approaches.")

    # Calculate and print the number of improved models; the denominator is
    # taken from the data instead of a hard-coded 14 so it cannot drift.
    improvements = count_improvements(two_scores, single_scores)
    print(f"Number of models that improved in the two-model approach: {improvements} out of {len(single_scores)}")

    # Calculate and print average F1-scores
    avg_single = np.mean(single_scores)
    avg_two = np.mean(two_scores)
    print(f"Average F1-score (Single-model): {avg_single:.2f}")
    print(f"Average F1-score (Two-model): {avg_two:.2f}")
    print(f"Difference: {avg_two - avg_single:.2f}")

# Overall comparison
# NOTE(review): this pools every (style, model) pair into one sample. Each
# list position recurs across all styles, so the pooled pairs are not
# independent observations; treat the overall p-value as descriptive.
all_single = [score for style in humor_styles for score in single_model_data[style]]
all_two = [score for style in humor_styles for score in two_model_data[style]]

overall_statistic, overall_p_value = stats.wilcoxon(all_single, all_two)

print("\nOverall Wilcoxon signed-rank test:")
print(f"Statistic: {overall_statistic}")
print(f"p-value: {overall_p_value:.4f}")

if overall_p_value < ALPHA:
    print("There is a significant overall difference between the two-model and single-model approaches.")
else:
    print("There is no significant overall difference between the two-model and single-model approaches.")

overall_improvements = count_improvements(all_two, all_single)
print(f"Overall number of improvements in the two-model approach: {overall_improvements} out of {len(all_single)}")

avg_single_overall = np.mean(all_single)
avg_two_overall = np.mean(all_two)
print(f"Overall average F1-score (Single-model): {avg_single_overall:.2f}")
print(f"Overall average F1-score (Two-model): {avg_two_overall:.2f}")
print(f"Overall difference: {avg_two_overall - avg_single_overall:.2f}")


Wilcoxon signed-rank test for Self-enhancing humor:
Statistic: 8.0
p-value: 0.0031
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 13 out of 14
Average F1-score (Single-model): 78.76
Average F1-score (Two-model): 83.85
Difference: 5.09

Wilcoxon signed-rank test for Self-deprecating humor:
Statistic: 3.0
p-value: 0.0006
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 13 out of 14
Average F1-score (Single-model): 73.14
Average F1-score (Two-model): 79.51
Difference: 6.37

Wilcoxon signed-rank test for Affiliative humor:
Statistic: 0.0
p-value: 0.0001
There is a significant difference between the two-model and single-model approaches.
Number of models that improved in the two-model approach: 14 out of 14
Average F1-score (Single-model): 49.19
Average F1-score (Two-model): 60.80
Difference: 11.61

Wil

# Number of samples in each class

In [12]:
import pandas as pd
# Load the 5-class humour dataset and report how many rows carry each label.
humor_5class_path = "datasets/Humour_style.xlsx"
df = pd.read_excel(humor_5class_path)
labels = df["LABELS"]

# Count every expected label code (0-4) explicitly. The previous if/elif chain
# ended in a bare `else`, so any value other than 0-3 (including a missing or
# mistyped label) was silently counted as class 4.
c_0, c_1, c_2, c_3, c_4 = (int((labels == code).sum()) for code in range(5))

# Rows whose label matched none of the five codes.
n_unexpected = len(labels) - (c_0 + c_1 + c_2 + c_3 + c_4)

print(f"Enh- {c_0}\n depr {c_1}\n AFF- {c_2}\n AGG- {c_3}\n Ne- {c_4}")
print(f"Sum: {c_0+c_1+c_2+c_3+c_4}")
if n_unexpected:
    # Surface bad rows instead of folding them into the last class.
    print(f"Warning: {n_unexpected} row(s) have a label outside 0-4 and were not counted.")

Enh- 298
 depr 265
 AFF- 250
 AGG- 318
 Ne- 332
Sum: 1463
