In [1]:
# !pip install statsmodels

In [2]:
import pandas as pd
import numpy as np
import math
import statsmodels as sm

from statsmodels.stats.contingency_tables import mcnemar, cochrans_q


print("Statsmodels version:", sm.__version__)
print("Numpy version:", np.__version__)
print("Pandas version:", pd.__version__)

Statsmodels version: 0.14.6
Numpy version: 2.3.3
Pandas version: 2.3.3


# Classification performance results

In [3]:
# ground truth labels
y_true = pd.read_csv('artifacts/bert/eval_predictions.csv')['label']

print(y_true)

0       8
1       8
2       0
3       0
4       0
       ..
2475    1
2476    7
2477    0
2478    5
2479    5
Name: label, Length: 2480, dtype: int64


## BERT vs RoBERTa vs BiLSTM

In [4]:
y_pred_bert = pd.read_csv('artifacts/bert/eval_predictions.csv')['y_pred']
y_pred_roberta = pd.read_csv('artifacts/roberta/eval_predictions.csv')['y_pred']
y_pred_bilstm = pd.read_csv('artifacts/bilstm/eval_predictions.csv')['y_pred']

cochrans_q_result = cochrans_q(
    np.vstack([y_pred_bert, y_pred_roberta, y_pred_bilstm])
)

print(cochrans_q_result)

df          2479
pvalue      4.80109917232847e-192
statistic   5175.821077702536


In [5]:
Q = cochrans_q_result.statistic
n = 2480
k = 3

R = (Q - (k -1)) / ((k - 1) * (n - 1))

print(R)

1.043529866418422


### Homogeneity of covariance

In [6]:
np.cov(np.vstack([y_pred_bert == y_true, y_pred_roberta == y_true, y_pred_bilstm == y_true]))

array([[0.15190032, 0.1087049 , 0.07084201],
       [0.1087049 , 0.16507551, 0.07029516],
       [0.07084201, 0.07029516, 0.2244647 ]])

## BERT vs RoBERTa

In [7]:
# bert and roberta predictions
y_pred_bert = pd.read_csv('artifacts/bert/eval_predictions.csv')['y_pred']
y_pred_roberta = pd.read_csv('artifacts/roberta/eval_predictions.csv')['y_pred']

print("BERT predictions")
print(y_pred_bert)

print("RoBERTa predictions")
print(y_pred_roberta)

BERT predictions
0       8
1       8
2       0
3       0
4       0
       ..
2475    1
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64
RoBERTa predictions
0       8
1       8
2       0
3       0
4       0
       ..
2475    1
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64


In [8]:
# contingency table agreement template
contingency_table = pd.DataFrame(
    [['n_11', 'n_10'],
     ['n_01', 'n_00']],
    index=['BERT Correct', 'BERT Wrong'],
    columns=['RoBERTa Correct', 'RoBERTa Wrong']
)

contingency_table

Unnamed: 0,RoBERTa Correct,RoBERTa Wrong
BERT Correct,n_11,n_10
BERT Wrong,n_01,n_00


In [9]:
n_11 = np.sum((y_pred_bert == y_true) & (y_pred_roberta == y_true))
n_10 = np.sum((y_pred_bert == y_true) & (y_pred_roberta != y_true))
n_01 = np.sum((y_pred_bert != y_true) & (y_pred_roberta == y_true))
n_00 = np.sum((y_pred_bert != y_true) & (y_pred_roberta != y_true))

table = np.array([[n_11, n_10],
                  [n_01, n_00]])

print(table)

mcnemar_result = mcnemar(table, exact=False)

print(mcnemar_result)

[[1866  151]
 [  97  366]]
pvalue      0.0007640413512558077
statistic   11.326612903225806


In [10]:
bert_roberta_odds_ratio = n_10 / n_01

print("Effect size (odds ratio) between BERT and RoBERTa:", bert_roberta_odds_ratio)

Effect size (odds ratio) between BERT and RoBERTa: 1.556701030927835


In [11]:
SE = math.sqrt(1/n_10 + 1/n_01)
lower_ci_bert_roberta = math.exp(math.log(bert_roberta_odds_ratio) - 1.96 * SE)
upper_ci_bert_roberta = math.exp(math.log(bert_roberta_odds_ratio) + 1.96 * SE)

print(f"95% CI for odds ratio between BERT and RoBERTa: ({lower_ci_bert_roberta}, {upper_ci_bert_roberta})")


95% CI for odds ratio between BERT and RoBERTa: (1.2062655759068208, 2.0089424319930824)


## BERT vs BiLSTM

In [12]:
# bert and bilstm predictions
y_pred_bert = pd.read_csv('artifacts/bert/eval_predictions.csv')['y_pred']
y_pred_bilstm = pd.read_csv('artifacts/bilstm/eval_predictions.csv')['y_pred']

print("BERT predictions")
print(y_pred_bert)

print("RoBERTa predictions")
print(y_pred_bilstm)

BERT predictions
0       8
1       8
2       0
3       0
4       0
       ..
2475    1
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64
RoBERTa predictions
0       2
1       0
2       1
3       1
4       1
       ..
2475    0
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64


In [13]:
# contingency table agreement template
contingency_table = pd.DataFrame(
    [['n_11', 'n_10'],
     ['n_01', 'n_00']],
    index=['BERT Correct', 'BERT Wrong'],
    columns=['BiLSTM Correct', 'BiLSTM Wrong']
)

contingency_table

Unnamed: 0,BiLSTM Correct,BiLSTM Wrong
BERT Correct,n_11,n_10
BERT Wrong,n_01,n_00


In [14]:
n_11 = np.sum((y_pred_bert == y_true) & (y_pred_bilstm == y_true))
n_10 = np.sum((y_pred_bert == y_true) & (y_pred_bilstm != y_true))
n_01 = np.sum((y_pred_bert != y_true) & (y_pred_bilstm == y_true))
n_00 = np.sum((y_pred_bert != y_true) & (y_pred_bilstm != y_true))

table = np.array([[n_11, n_10],
                  [n_01, n_00]])

print(table)

mcnemar_result = mcnemar(table, exact=False)

print(mcnemar_result)

[[1507  510]
 [ 130  333]]
pvalue      9.73091609290084e-51
statistic   224.4390625


In [15]:
bert_bilstm_odds_ratio = n_10 / n_01

print("Effect size (odds ratio) between BERT and BiLSTM:", bert_bilstm_odds_ratio)

Effect size (odds ratio) between BERT and BiLSTM: 3.923076923076923


In [16]:
SE = math.sqrt(1/n_10 + 1/n_01)
lower_ci_bert_bilstm = math.exp(math.log(bert_bilstm_odds_ratio) - 1.96 * SE)
upper_ci_bert_bilstm = math.exp(math.log(bert_bilstm_odds_ratio) + 1.96 * SE)

print(f"95% CI for odds ratio between BERT and BiLSTM: ({lower_ci_bert_bilstm}, {upper_ci_bert_bilstm})")

95% CI for odds ratio between BERT and BiLSTM: (3.2358963906250886, 4.756188297303813)


## RoBERTa vs BiLSTM

In [17]:
# roberta and bilstm predictions
y_pred_roberta = pd.read_csv('artifacts/roberta/eval_predictions.csv')['y_pred']
y_pred_bilstm = pd.read_csv('artifacts/bilstm/eval_predictions.csv')['y_pred']

print("BERT predictions")
print(y_pred_roberta)

print("BiLSTM predictions")
print(y_pred_bilstm)

BERT predictions
0       8
1       8
2       0
3       0
4       0
       ..
2475    1
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64
BiLSTM predictions
0       2
1       0
2       1
3       1
4       1
       ..
2475    0
2476    7
2477    0
2478    5
2479    5
Name: y_pred, Length: 2480, dtype: int64


In [18]:
# contingency table agreement template
contingency_table = pd.DataFrame(
    [['n_11', 'n_10'],
     ['n_01', 'n_00']],
    index=['RoBERTa Correct', 'RoBERTa Wrong'],
    columns=['BiLSTM Correct', 'BiLSTM Wrong']
)

contingency_table

Unnamed: 0,BiLSTM Correct,BiLSTM Wrong
RoBERTa Correct,n_11,n_10
RoBERTa Wrong,n_01,n_00


In [19]:
n_11 = np.sum((y_pred_roberta == y_true) & (y_pred_bilstm == y_true))
n_10 = np.sum((y_pred_roberta == y_true) & (y_pred_bilstm != y_true))
n_01 = np.sum((y_pred_roberta != y_true) & (y_pred_bilstm == y_true))
n_00 = np.sum((y_pred_roberta != y_true) & (y_pred_bilstm != y_true))

table = np.array([[n_11, n_10],
                  [n_01, n_00]])

print(table)

mcnemar_result = mcnemar(table, exact=False)

print(mcnemar_result)

[[1470  493]
 [ 167  350]]
pvalue      1.1101260240194846e-36
statistic   160.03787878787878


In [20]:
roberta_bilstm_odds_ratio = n_10 / n_01

print("Effect size (odds ratio) between RoBERTa and BiLSTM:", roberta_bilstm_odds_ratio)

Effect size (odds ratio) between RoBERTa and BiLSTM: 2.9520958083832336


In [21]:
SE = math.sqrt(1/n_10 + 1/n_01)
lower_ci_roberta_bilstm = math.exp(math.log(roberta_bilstm_odds_ratio) - 1.96 * SE)
upper_ci_roberta_bilstm = math.exp(math.log(roberta_bilstm_odds_ratio) + 1.96 * SE)

print(f"95% CI for odds ratio between RoBERTa and BiLSTM: ({lower_ci_roberta_bilstm}, {upper_ci_roberta_bilstm})")

95% CI for odds ratio between RoBERTa and BiLSTM: (2.476949587943646, 3.518387981851867)


# Computational performance results for training

In [22]:
# traning of computational performance

training_bilstm_computational_performance = {"tag": "Train BiLSTM", "wall_time_sec": 12.607763934999639, "cpu_user_sec": 14.98, "cpu_system_sec": 2.21, "ram_delta_mb": 714.87890625, "gpu_peak_mem_mb": 659.55126953125}
training_bert_computational_performance = {"tag": "TRAINING - BERT - OPP115", "wall_time_sec": 2156.2268697129994, "cpu_user_sec": 2144.84, "cpu_system_sec": 4.52, "ram_delta_mb": 1324.76171875, "gpu_peak_mem_mb": 7489.453125}
training_roberta_computational_performance = {"tag": "TRAINING - RoBERTa - OPP115", "wall_time_sec": 2161.9818342470007, "cpu_user_sec": 2150.05, "cpu_system_sec": 5.79, "ram_delta_mb": 1342.984375, "gpu_peak_mem_mb": 7664.5234375}

training_bert_cpu_total = training_bert_computational_performance['cpu_user_sec'] + training_bert_computational_performance['cpu_system_sec']
training_roberta_cpu_total = training_roberta_computational_performance['cpu_user_sec'] + training_roberta_computational_performance['cpu_system_sec']
training_bilstm_cpu_total = training_bilstm_computational_performance['cpu_user_sec'] + training_bilstm_computational_performance['cpu_system_sec']

print("BERT training CPU total seconds: ", training_bert_cpu_total)
print("RoBERTa training CPU total seconds: ", training_roberta_cpu_total)
print("BiLSTM training CPU total seconds: ", training_bilstm_cpu_total)


BERT training CPU total seconds:  2149.36
RoBERTa training CPU total seconds:  2155.84
BiLSTM training CPU total seconds:  17.19
