In [1]:
# IMPORTS
from scipy.stats import wilcoxon
from scipy.stats import ttest_rel
import numpy as np

# Human BERT detection difficulty on human data vs llm generated data

In [7]:
# POLITIFACT
human_data_sr = np.array([91.8918918918919, 88.75, 80.0, 94.20289855072464, 72.97297297297297])
llm_data_sr = np.array([72.59615384615384, 64.42307692307693, 50.0, 78.36538461538461, 55.52884615384615])

# Perform the Wilcoxon signed-rank test
statistic, p_value = wilcoxon(human_data_sr, llm_data_sr)
print(f'Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

# Perform the paired t-test
t_statistic, t_p_value = ttest_rel(human_data_sr, llm_data_sr)
print(f'Paired t-test statistic: {t_statistic}')
print(f'p-value: {t_p_value}')

Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
Paired t-test statistic: 8.274263903179191
p-value: 0.0011643529665913922


In [8]:
# CoAID
co_human_data_sr = np.array([97.35449735449735, 99.4475138121547, 99.45054945054946, 92.97297297297298, 95.50561797752809])
co_llm_data_sr = np.array([100.0, 100.0, 100.0, 100.0, 100.0])

# Perform the Wilcoxon signed-rank test
co_statistic, co_p_value = wilcoxon(co_human_data_sr, co_llm_data_sr)

print(f'Wilcoxon signed-rank statistic: {co_statistic}')
print(f'p-value: {co_p_value}')

# Perform the paired t-test
t_statistic, t_p_value = ttest_rel(human_data_sr, llm_data_sr)
print(f'Paired t-test statistic: {t_statistic}')
print(f'p-value: {t_p_value}')

Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
Paired t-test statistic: 8.274263903179191
p-value: 0.0011643529665913922


In [9]:
# GossipCop
human_data_sr = np.array([78.54889589905363, 74.55089820359282, 78.18756585879873, 83.38727076591155, 0.0])
llm_data_sr = np.array([76.2589928057554, 71.94244604316546, 71.22302158273382, 75.53956834532374, 0.0])

# Perform the Wilcoxon signed-rank test
statistic, p_value = wilcoxon(human_data_sr, llm_data_sr)

print(f'Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

# Perform the paired t-test
t_statistic, t_p_value = ttest_rel(human_data_sr, llm_data_sr)
print(f'Paired t-test statistic: {t_statistic}')
print(f'p-value: {t_p_value}')

Wilcoxon signed-rank statistic: 0.0
p-value: 0.06788915486182899
Paired t-test statistic: 2.64466365622991
p-value: 0.05730031967210537




------------------------------

# Human & LLM Fine-Tuned Bert vs closest baseline performance on human data (Wilcoxon Only)

In [14]:
# PolitiFact Human Data
human_llm_bert_sr = np.array([90.54054054054053, 71.25, 86.25, 72.46376811594203, 93.24324324324324])
nb_sr = np.array([92.11,92.11,92.11,92.11,92.11])
statistic, p_value = wilcoxon(human_llm_bert_sr, nb_sr)
print(f'SR Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')


human_llm_bert_f1 = np.array([0.93, 0.86, 0.92, 0.87, 0.91])
human_bert_f1 = np.array([0.88, 0.9, 0.88, 0.83, 0.87])
statistic, p_value = wilcoxon(human_llm_bert_f1, human_bert_f1)
print(f'F1 Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

SR Wilcoxon signed-rank statistic: 1.0
p-value: 0.125
F1 Wilcoxon signed-rank statistic: 2.5
p-value: 0.3125


In [15]:
# CoAID Human Data
human_llm_bert_sr = np.array([99.47089947089947, 97.79005524861878, 99.45054945054946, 96.21621621621622, 99.43820224719101])
human_bert = np.array([97.35449735449735, 99.4475138121547, 99.45054945054946, 92.97297297297298, 95.50561797752809])
statistic, p_value = wilcoxon(human_llm_bert_sr, human_bert)
print(f'SR Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_f1 = np.array([0.99, 0.98, 0.97, 0.98, 0.98])
human_bert_f1 = np.array([0.97, 0.83, 0.97, 0.97, 0.96])
statistic, p_value = wilcoxon(human_llm_bert_f1, human_bert_f1)
print(f'F1 Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

SR Wilcoxon signed-rank statistic: 1.0
p-value: 0.14412703481601533
F1 Wilcoxon signed-rank statistic: 0.0
p-value: 0.06559969214707187




In [18]:
# GossipCop Human Data
human_llm_bert_sr = np.array([78.54889589905363, 74.55089820359282, 78.18756585879873, 83.38727076591155, 0.0])
human_nb = np.array([74.92, 74.92, 74.92, 74.92, 74.92])
statistic, p_value = wilcoxon(human_llm_bert_sr, human_nb)
print(f'SR Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_f1 = np.array([0.99, 0.98, 0.97, 0.98, 0.98])
human_nb_f1 = np.array([0.78, 0.79, 0.77, 0.72, 0.43])
statistic, p_value = wilcoxon(human_llm_bert_f1, human_nb_f1)
print(f'F1 Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

SR Wilcoxon signed-rank statistic: 6.0
p-value: 0.8125
F1 Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625


----------------------------------------------------------------------------------------------

# Human & LLM Fine-Tuned Bert vs closest baseline performance on LLM data (Wilcoxon Only)

In [34]:
# POLITIFACT
human_llm_bert_overall = np.array([93.75, 93.02884615384616, 93.75, 93.02884615384616, 96.875])
human_nb_overall = np.array([71.70, 71.70, 71.70, 71.70, 71.70])
statistic, p_value = wilcoxon(human_llm_bert_overall, human_nb_overall)
print(f'Overall Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_hal = np.array([100.0, 100.0, 100.0, 85.0, 100.0])
human_bert_hal = np.array([55.00000000000001, 40.0, 0.0, 85.0, 0.0])
statistic, p_value = wilcoxon(human_llm_bert_hal, human_bert_hal)
print(f'HAL Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_amb = np.array([82.75862068965517, 79.3103448275862, 79.3103448275862, 82.75862068965517, 96.55172413793103])
phi_amb = np.array([59.09, 59.09, 59.09, 59.09, 59.09])
statistic, p_value = wilcoxon(human_llm_bert_amb, phi_amb)
print(f'AMB Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_fal = np.array([79.3103448275862, 79.3103448275862, 79.3103448275862, 86.20689655172413, 96.55172413793103])
phi_fal = np.array([61.9, 61.9, 61.9, 61.9, 61.9])
statistic, p_value = wilcoxon(human_llm_bert_fal, phi_fal)
print(f'FAL Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_inc = np.array([86.20689655172413, 86.20689655172413, 82.75862068965517, 65.51724137931035, 79.3103448275862])
phi_inc = np.array([47.62, 47.62, 47.62, 47.62, 47.62])
statistic, p_value = wilcoxon(human_llm_bert_inc, phi_inc)
print(f'INC Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_out = np.array([86.20689655172413, 82.75862068965517, 82.75862068965517, 96.55172413793103, 89.65517241379311])
phi_out = np.array([86.36, 86.36, 86.36, 86.36, 86.36])
statistic, p_value = wilcoxon(human_llm_bert_out, phi_out)
print(f'OUT Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_rew = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_rew = np.array([94.44444444444444, 87.03703703703704, 68.51851851851852, 96.29629629629629, 75.92592592592592])
statistic, p_value = wilcoxon(human_llm_bert_rew, human_bert_rew)
print(f'REW Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_arb = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_arb = np.array([100.0, 90.0, 95.0, 100.0, 95.0])
statistic, p_value = wilcoxon(human_llm_bert_arb, human_bert_arb)
print(f'ARB Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_tot = np.array([93.10344827586206, 89.65517241379311, 100.0, 86.20689655172413, 96.55172413793103])
phi_tot = np.array([72.22, 72.22, 72.22, 72.22, 72.22])
statistic, p_value = wilcoxon(human_llm_bert_tot, phi_tot)
print(f'TOT Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_uns = np.array([82.75862068965517, 89.65517241379311, 86.20689655172413, 93.10344827586206, 100.0])
phi_uns = np.array([75, 75, 75, 75, 75])
statistic, p_value = wilcoxon(human_llm_bert_uns, phi_uns)
print(f'UNS Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_ope = np.array([100.0, 98.14814814814815, 100.0, 100.0, 100.0])
human_bert_ope = np.array([94.44444444444444, 90.74074074074075, 79.62962962962963, 100.0, 62.96296296296296])
statistic, p_value = wilcoxon(human_llm_bert_ope, human_bert_ope)
print(f'OPE Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_par = np.array([100.0, 98.14814814814815, 100.0, 100.0, 98.14814814814815])
nb_par = np.array([70.27, 70.27, 70.27, 70.27, 70.27])
statistic, p_value = wilcoxon(human_llm_bert_par, nb_par)
print(f'PAR Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

'''human_llm_bert_hea = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
nb_hea = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
statistic, p_value = wilcoxon(human_llm_bert_hea, nb_hea)'''
print(f'HEA Wilcoxon signed-rank statistic: NA')
print(f'p-value: 1')

'''human_llm_bert_pol = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
nb_pol = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
statistic, p_value = wilcoxon(human_llm_bert_pol, nb_pol)'''
print(f'POL Wilcoxon signed-rank statistic: NA')   
print(f'p-value: 1')

Overall Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
HAL Wilcoxon signed-rank statistic: 0.0
p-value: 0.06559969214707187
AMB Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
FAL Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
INC Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
OUT Wilcoxon signed-rank statistic: 7.0
p-value: 1.0
REW Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
ARB Wilcoxon signed-rank statistic: 0.0
p-value: 0.10247043485974937
TOT Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
UNS Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
OPE Wilcoxon signed-rank statistic: 0.0
p-value: 0.06788915486182899
PAR Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
HEA Wilcoxon signed-rank statistic: NA
p-value: 1
POL Wilcoxon signed-rank statistic: NA
p-value: 1




In [37]:
# GOSSIPCOP
human_llm_bert_overall = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_overall = np.array([76.2589928057554, 71.94244604316546, 71.22302158273382, 75.53956834532374, 0.0])
statistic, p_value = wilcoxon(human_llm_bert_overall, human_bert_overall)
print(f'Overall Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_ope = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_ope = np.array([69.56521739130434, 65.21739130434783, 56.52173913043478, 59.57446808510638, 0.0])
statistic, p_value = wilcoxon(human_llm_bert_ope, human_bert_ope)
print(f'OPE Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_par = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_par = np.array([85.1063829787234, 78.26086956521739, 78.26086956521739, 86.95652173913044, 0.0])
statistic, p_value = wilcoxon(human_llm_bert_par, human_bert_par)
print(f'PAR Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

human_llm_bert_rew = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_rew = np.array([73.91304347826086, 72.3404255319149, 78.72340425531915, 80.43478260869566, 0.0])
statistic, p_value = wilcoxon(human_llm_bert_rew, human_bert_rew)
print(f'REW Wilcoxon signed-rank statistic: {statistic}')
print(f'p-value: {p_value}')

Overall Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
OPE Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
PAR Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625
REW Wilcoxon signed-rank statistic: 0.0
p-value: 0.0625


In [38]:
# COAID
'''human_llm_bert_overall = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_overall = np.array([100,100,100,100,100])
statistic, p_value = wilcoxon(human_llm_bert_overall, human_bert_overall)'''
print(f'Overall Wilcoxon signed-rank statistic: NA')
print(f'p-value: 1')

'''human_llm_bert_ope = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_ope = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
statistic, p_value = wilcoxon(human_llm_bert_ope, human_bert_ope)'''
print(f'OPE Wilcoxon signed-rank statistic: NA')
print(f'p-value: 1')

'''human_llm_bert_par = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_par = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
statistic, p_value = wilcoxon(human_llm_bert_par, human_bert_par)'''
print(f'PAR Wilcoxon signed-rank statistic: NA')
print(f'p-value: 1')

'''human_llm_bert_rew = np.array([100.0, 100.0, 100.0, 100.0, 100.0])
human_bert_rew = np.array(100.0, 100.0, 100.0, 100.0, 100.0)
statistic, p_value = wilcoxon(human_llm_bert_rew, human_bert_rew)'''
print(f'REW Wilcoxon signed-rank statistic: NA')
print(f'p-value: 1')

Overall Wilcoxon signed-rank statistic: NA
p-value: 1
OPE Wilcoxon signed-rank statistic: NA
p-value: 1
PAR Wilcoxon signed-rank statistic: NA
p-value: 1
REW Wilcoxon signed-rank statistic: NA
p-value: 1
