In [16]:
import os

import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, f1_score

In [75]:
# Dataset to analyse; one of:
# ["AWPNLI", "RedditNLI", "StressTest", "RTE_Quant", "NewsNLI"]
dataset = "RTE_Quant"

# The project root is taken to be two directory levels above the working directory.
parent_dir = os.path.dirname(os.getcwd())
root_path = os.path.dirname(parent_dir)

# Phase-2 results for the chosen dataset live under data/equate_labelled.
labelled_dir = os.path.join(root_path, "data", "equate_labelled")
results_path = os.path.join(labelled_dir, f"{dataset}_phase2.csv")

# Make sure the output folder for processed files exists.
os.makedirs(os.path.join(labelled_dir, "processed"), exist_ok=True)

In [76]:
# Load the phase-2 results CSV for the selected dataset and preview the first rows.
res = pd.read_csv(results_path)
res.head()

Unnamed: 0,sample_index,generated_label,error_message,golden_label,premise,hypothesis
0,575,entailment,,neutral,"Peter Siebold, 43, was identified as the pilot...",Pilot who survived identified as Peter Siebold...
1,914,contradiction,,neutral,Holder said the Justice Department is working ...,Ten Mexican nationals have been charged and se...
2,450,entailment,,entailment,The nine were brought into Somali waters and t...,"Nine people were brought into Somali waters, h..."
3,860,neutral,,neutral,There are some additional 863 cancer cases amo...,"There are at least 2,509 certified cancer case..."
4,831,neutral,,neutral,The survey said 57% supported it going forward...,New CNN/ORC survey shows 57% of Americans supp...


In [77]:
# (rows, columns) of the raw results frame.
res.shape

(958, 6)

In [78]:
# Number of rows whose sample_index repeats an earlier row.
int(res.duplicated(subset=["sample_index"]).sum())

0

In [79]:
# Disabled one-off cleanup: would drop rows with a repeated sample_index (keeping the
# first occurrence) and rewrite the results CSV in place. Kept commented out so the
# source file is not modified on a normal run.
# res = res.drop_duplicates(subset=['sample_index'], keep='first', ignore_index=True)
# res.to_csv(results_path, index=False)
# res.shape

Do we have scripts that threw an error?

In [80]:
# Number of rows that carry a non-null error_message.
int(res['error_message'].notna().sum())

0

In [81]:
# sample_index values of the rows that carry a non-null error_message.
has_error = res['error_message'].notna()
list(res.loc[has_error, "sample_index"].to_numpy())

[]

Filter out erroneous scripts, if any

In [82]:
# Keep only the rows without an error_message. The explicit .copy() makes valid_res an
# independent frame, so the column assignments performed on it in later cells do not
# raise SettingWithCopyWarning or depend on pandas' view-vs-copy behaviour.
valid_res = res[res["error_message"].isna()].copy()

In [83]:
# Share of each gold label among the valid rows (normalize=True returns fractions).
valid_res["golden_label"].value_counts(normalize=True)

golden_label
entailment    0.504175
neutral       0.495825
Name: proportion, dtype: float64

In [84]:
# Absolute count of each predicted label among the valid rows.
valid_res["generated_label"].value_counts()

generated_label
entailment       521
neutral          382
contradiction     55
Name: count, dtype: int64

In [85]:
# For AWPNLI only: discard the rows whose predicted label is "neutral".
if dataset == "AWPNLI":
    is_not_neutral = valid_res["generated_label"].ne("neutral")
    valid_res = valid_res[is_not_neutral]

In [86]:
# Disabled export of the rows predicted as "contradiction" to an Excel file.
# NOTE(review): the output filename is hard-coded to NewsNLI regardless of `dataset`.
# valid_res[valid_res["generated_label"]=="contradiction"].to_excel("NewsNLI_predicted_contradiction.xlsx")

In [87]:
# Disabled reference values: per-dataset baseline accuracy / per-class F1 numbers.
# Nothing in this notebook reads them.
# NOTE(review): the RedditNLI values look like percentages while the other datasets use
# fractions (or 0 placeholders) — confirm the intended scale before re-enabling.
# # AWPNLI baselines
# mb_baseline_accuracy = 0.50
# rsb_baseline_accuracy = 0.50
# rsb_baseline_f1_c = 0.5
# rsb_baseline_f1_e = 0.5
#
# # RedditNLI baselines
# mb_baseline_accuracy = 57.89
# rsb_baseline_accuracy = 45.94
# rsb_baseline_f1_e = 57.89
# NOTE(review): rsb_baseline_f1_n is assigned twice below; one of the two is presumably
# meant to be rsb_baseline_f1_c — confirm against the source of these numbers.
# rsb_baseline_f1_n = 34.41
# rsb_baseline_f1_n = 7.69
#
# # StressTest baselines
# mb_baseline_accuracy = 0.33
# rsb_baseline_accuracy = 0.33
# rsb_baseline_f1_c = 0.33
# rsb_baseline_f1_e = 0.33
# rsb_baseline_f1_n = 0.33
#
# # NewsNLI baselines
# mb_baseline_accuracy = 0
# rsb_baseline_accuracy = 0
# rsb_baseline_f1_c = 0
# rsb_baseline_f1_e = 0
#
# # RTE_Quant baselines
# mb_baseline_accuracy = 0
# rsb_baseline_accuracy = 0
# rsb_baseline_f1_c = 0
# rsb_baseline_f1_e = 0

In [88]:
# RTE_Quant and NewsNLI are scored as two-way tasks here: every "contradiction"
# prediction is rewritten to "neutral" before scoring.
# Note: this overwrites the raw predictions in generated_label, so any later comparison
# that needs the original three-way predictions must run before this cell.
if dataset in ["RTE_Quant", "NewsNLI"]:
    # Rows currently predicted as "contradiction".
    convert_contradiction_to_neutral_mask = valid_res["generated_label"] == "contradiction"

    # Build a new frame instead of assigning through .loc: valid_res may be a filtered
    # slice of res, and in-place assignment on such a slice raises SettingWithCopyWarning.
    # Re-running the cell is harmless because the mask is simply empty the second time.
    valid_res = valid_res.assign(
        generated_label=valid_res["generated_label"].mask(convert_contradiction_to_neutral_mask, "neutral")
    )

In [89]:
# Number of rows where the predicted label equals the gold label.
int((valid_res['golden_label'] == valid_res['generated_label']).sum())

696

In [90]:
# Fraction of valid rows whose predicted label equals the gold label.
accuracy_score(y_true=valid_res['golden_label'], y_pred=valid_res['generated_label'])

0.7265135699373695

In [74]:
# Per-class precision / recall / F1 of the predictions against the gold labels.
print(classification_report(y_true=valid_res['golden_label'], y_pred=valid_res['generated_label']))

              precision    recall  f1-score   support

  entailment       0.75      0.96      0.84        68
     neutral       0.96      0.77      0.85        94

    accuracy                           0.85       162
   macro avg       0.85      0.86      0.85       162
weighted avg       0.87      0.85      0.85       162



In [285]:
# Export the evaluated rows, with the prediction column renamed to "reference_label".
# The export is built in its own variable: rebinding valid_res here would drop
# error_message and rename generated_label, and every later cell that still reads
# valid_res["generated_label"] would then fail with a KeyError on a top-to-bottom run.
processed_res = valid_res[["sample_index", "premise", "hypothesis", "golden_label", "generated_label"]].rename(columns={"generated_label": "reference_label"})
processed_res.to_csv(os.path.join(root_path, "data", "equate_labelled", "processed", f"{dataset}.csv"), index=False)

## Only for RTE_Quant and NewsNLI

In [229]:
# valid_res[valid_res["generated_label"]=="contradiction"][["sample_index", "premise", "hypothesis"]].to_excel("NewsNLI_predicted_contradiction.xlsx")

# Samples whose target label is neutral, which the model labels as contradiction, and which
# manual inspection confirms as contradiction (only applicable to NewsNLI and RTE_Quant,
# which treat the "contradiction" case as neutral).
news_nli_corrected_labels_indices = [914, 736, 663, 749, 838, 605]
mask_news = valid_res['sample_index'].isin(news_nli_corrected_labels_indices)

rte_quant_corrected_labels_indices = [118, 128, 98, 7, 21, 54, 147, 20, 30, 155, 52, 95, 120, 72, 150, 57, 111, 26, 115, 58, 4, 18]
mask_rte = valid_res['sample_index'].isin(rte_quant_corrected_labels_indices)

# Apply only the correction list that belongs to the dataset being analysed. Previously
# mask_news was applied unconditionally and mask_rte was never used, so RTE_Quant runs
# were corrected with NewsNLI sample indices. Any other dataset gets no correction.
if dataset == "NewsNLI":
    correction_mask = mask_news
elif dataset == "RTE_Quant":
    correction_mask = mask_rte
else:
    correction_mask = pd.Series(False, index=valid_res.index)

# assign() returns a new frame, which avoids SettingWithCopyWarning when valid_res is a
# filtered slice and keeps the cell safe to re-run.
valid_res = valid_res.assign(
    corrected_golden_label=valid_res["golden_label"].mask(correction_mask, "contradiction")
)
valid_res["corrected_golden_label"].value_counts()

corrected_golden_label
neutral          72
entailment       68
contradiction    22
Name: count, dtype: int64

#### First, consider the case where we correct the golden label through manual verification. The dataset is then framed as a 3-way decision (entailment / neutral / contradiction).

In [230]:
# Share of each manually corrected gold label (normalize=True returns fractions).
valid_res["corrected_golden_label"].value_counts(normalize=True)

corrected_golden_label
neutral          0.444444
entailment       0.419753
contradiction    0.135802
Name: proportion, dtype: float64

In [234]:
# Number of rows where the prediction equals the manually corrected gold label.
int((valid_res['corrected_golden_label'] == valid_res['generated_label']).sum())

134

In [231]:
# Accuracy against the manually corrected (three-way) gold labels.
# NOTE(review): if the earlier cell that rewrites "contradiction" to "neutral" in
# generated_label has run for this dataset, the predictions no longer contain
# "contradiction" — confirm the three-way scores are computed on the raw predictions.
accuracy_score(y_true=valid_res['corrected_golden_label'], y_pred=valid_res['generated_label'])

0.8271604938271605

In [235]:
# Per-class precision / recall / F1 against the manually corrected gold labels.
print(classification_report(y_true=valid_res['corrected_golden_label'], y_pred=valid_res['generated_label']))

               precision    recall  f1-score   support

contradiction       0.85      1.00      0.92        22
   entailment       0.75      0.96      0.84        68
      neutral       0.96      0.65      0.78        72

     accuracy                           0.83       162
    macro avg       0.85      0.87      0.84       162
 weighted avg       0.85      0.83      0.82       162



#### Now consider the case where we treat a "contradiction" prediction as "neutral", to remain consistent with the results reported in other papers

In [296]:
# Two-way view of the predictions: "entailment" is kept and every other label becomes
# "neutral". where() does this in one vectorised step instead of a per-row lambda, and
# assign() returns a new frame, so no SettingWithCopyWarning is raised when valid_res
# is a filtered slice.
valid_res = valid_res.assign(
    modified_label=valid_res["generated_label"].where(valid_res["generated_label"] == "entailment", "neutral")
)

In [253]:
# Share of each original gold label (normalize=True returns fractions).
valid_res['golden_label'].value_counts(normalize=True)

golden_label
entailment    0.504175
neutral       0.495825
Name: proportion, dtype: float64

In [254]:
# Number of rows where the two-way prediction equals the original gold label.
int((valid_res['golden_label'] == valid_res['modified_label']).sum())

696

In [255]:
# Accuracy of the two-way predictions (modified_label) against the original gold labels.
accuracy_score(y_true=valid_res['golden_label'], y_pred=valid_res['modified_label'])

0.7265135699373695

In [256]:
# Per-class precision / recall / F1 of the two-way predictions against the gold labels.
print(classification_report(y_true=valid_res['golden_label'], y_pred=valid_res['modified_label']))

              precision    recall  f1-score   support

  entailment       0.71      0.77      0.74       483
     neutral       0.74      0.68      0.71       475

    accuracy                           0.73       958
   macro avg       0.73      0.73      0.73       958
weighted avg       0.73      0.73      0.73       958



In [297]:
# Columns written to the processed file; the two-way prediction is exported under the
# name "reference_label".
export_columns = ["sample_index", "premise", "hypothesis", "golden_label", "modified_label"]
save_data = valid_res[export_columns].rename(columns={"modified_label": "reference_label"})

# Write data/equate_labelled/processed/<dataset>.csv without the index column.
processed_csv_path = os.path.join(root_path, "data", "equate_labelled", "processed", f"{dataset}.csv")
save_data.to_csv(processed_csv_path, index=False)

In [111]:
# Preview the ten rows with the smallest sample_index.
valid_res.sort_values(by="sample_index").head(10)

Unnamed: 0,sample_index,label,error_message,golden_label
134,0,neutral,,neutral
148,1,entailment,,neutral
8,2,entailment,,neutral
11,3,entailment,,entailment
140,4,neutral,,neutral
145,5,entailment,,neutral
5,6,entailment,,entailment
17,7,neutral,,neutral
39,8,entailment,,entailment
54,9,entailment,,entailment


In [112]:
# Sorted, de-duplicated sample_index values of the rows whose prediction differs from
# the gold label.
is_misclassified = valid_res["generated_label"] != valid_res["golden_label"]
misclassified_samples_indices = sorted(valid_res.loc[is_misclassified, "sample_index"].unique())
misclassified_samples_indices

[1,
 2,
 5,
 10,
 20,
 26,
 31,
 37,
 39,
 42,
 47,
 54,
 55,
 65,
 66,
 77,
 81,
 82,
 87,
 89,
 92,
 93,
 94,
 103,
 106,
 109,
 110,
 139,
 142,
 151,
 162]

AWPNLI:
approx issues: 58, 60, 500, 602, 614, 648, 686
ambiguity: 62, 82, 304, 305, 482
wrong label in equate: 107, 106, 109, 138, 550, 674
RedditNLI:
not sure how to correct: 24,
wrong label: 36
RTE_Quant:
wrong label: 26, 31
not sure how to correct: 142

In [113]:
# Number of distinct misclassified samples.
len(misclassified_samples_indices)

31

In [155]:
# Collect the indices of generated scripts whose first three lines mention
# "more than" or "less than".
scripts_path = os.path.join(root_path, "data", "generated", dataset, "script_with_cot_vars_first")
sample_indices = []
for script_file in os.listdir(scripts_path):
    if not script_file.endswith(".py"):
        continue
    # The sample index is the last "_"-separated token of the file name before the first ".".
    idx = int(script_file.split(".")[0].split("_")[-1])
    with open(os.path.join(scripts_path, script_file), 'r') as f:
        header = "\n".join(f.readlines()[:3]).strip()
    if any(phrase in header for phrase in ("more than", "less than")):
        sample_indices.append(idx)
sorted(sample_indices)

[0,
 1,
 2,
 3,
 4,
 6,
 7,
 8,
 9,
 10,
 12,
 13,
 14,
 15,
 16,
 18,
 19,
 20,
 21,
 22,
 24,
 26,
 28,
 29,
 30,
 32,
 34,
 35,
 36,
 38,
 39,
 40,
 41,
 42,
 44,
 46,
 47,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 60,
 62,
 64,
 66,
 67,
 68,
 70,
 71,
 72,
 73,
 74,
 76,
 78,
 79,
 80,
 82,
 84,
 86,
 88,
 90,
 92,
 93,
 94,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 106,
 108,
 110,
 111,
 112,
 113,
 116,
 118,
 119,
 120,
 121,
 123,
 124,
 126,
 128,
 130,
 132,
 133,
 134,
 135,
 136,
 138,
 139,
 140,
 142,
 143,
 144,
 146,
 148,
 149,
 150,
 152,
 154,
 155,
 156,
 157,
 158,
 160,
 161,
 162,
 164,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 176,
 177,
 178,
 180,
 182,
 183,
 184,
 186,
 187,
 188,
 190,
 192,
 194,
 195,
 196,
 197,
 198,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 208,
 209,
 210,
 212,
 214,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 224,
 226,
 227,
 228,
 230,
 231,
 232,
 234,
 236,
 237,
 238,
 240,
 242,
 243,
 244,
 245,
 2

In [158]:
# Misclassified samples whose script header mentions a "more than" / "less than"
# quantifier, kept in the order of misclassified_samples_indices.
quantifier_indices = set(sample_indices)
misclassified_with_quantifier = [idx for idx in misclassified_samples_indices if idx in quantifier_indices]
len(misclassified_with_quantifier)

130

In [159]:
# Display the misclassified sample indices that also matched the quantifier scan.
misclassified_with_quantifier

[1,
 7,
 13,
 15,
 21,
 29,
 35,
 39,
 47,
 55,
 57,
 67,
 68,
 73,
 93,
 96,
 97,
 99,
 101,
 103,
 111,
 113,
 119,
 123,
 124,
 126,
 133,
 135,
 143,
 149,
 155,
 161,
 171,
 173,
 177,
 183,
 190,
 195,
 197,
 203,
 205,
 209,
 217,
 221,
 227,
 237,
 243,
 249,
 259,
 267,
 269,
 271,
 273,
 281,
 283,
 287,
 295,
 311,
 321,
 325,
 327,
 331,
 335,
 337,
 340,
 347,
 368,
 371,
 373,
 383,
 385,
 389,
 392,
 393,
 405,
 415,
 427,
 431,
 439,
 451,
 452,
 473,
 477,
 489,
 491,
 495,
 499,
 501,
 505,
 506,
 509,
 511,
 525,
 527,
 533,
 549,
 577,
 579,
 585,
 589,
 595,
 598,
 601,
 602,
 605,
 611,
 613,
 623,
 629,
 641,
 647,
 649,
 653,
 661,
 663,
 669,
 680,
 683,
 685,
 691,
 693,
 699,
 1168,
 1483,
 2386,
 2727,
 3937,
 4319,
 4873,
 4945]

In [20]:
# Pooled evaluation over all five EQUATE subsets.
root_path = os.path.dirname(os.path.dirname(os.getcwd()))
equate_frames = []

# The loop uses its own variable names so that it does not overwrite the notebook-level
# `dataset` / `results_path` configuration used by the per-dataset cells.
for dataset_name in ["AWPNLI", "RedditNLI", "StressTest", "RTE_Quant", "NewsNLI"]:
    print(dataset_name)
    dataset_results_path = os.path.join(root_path, "data", "equate_labelled", f"{dataset_name}_phase2.csv")
    df = pd.read_csv(dataset_results_path)
    # print(df.shape[0])
    if dataset_name in ["RTE_Quant", "NewsNLI"]:
        # These two subsets are scored two-way: "contradiction" predictions count as "neutral".
        convert_contradiction_to_neutral_mask = df["generated_label"] == "contradiction"
        df.loc[convert_contradiction_to_neutral_mask, "generated_label"] = "neutral"
    # print(df[df["golden_label"] == df["generated_label"]].shape[0])
    # print("Acc:", round(accuracy_score(df["golden_label"], df["generated_label"]), 3))
    # The score is micro-averaged F1 (for single-label classification this equals
    # accuracy), so it is labelled as such rather than as a weighted F1.
    # Use average="weighted" here if a weighted F1 is what should be reported.
    print("micro-F1:", round(f1_score(df["golden_label"], df["generated_label"], average="micro"), 4))
    equate_frames.append(df)

# Concatenate once, instead of growing a DataFrame inside the loop.
equate = pd.concat(equate_frames, ignore_index=True)

print(equate.shape[0])
print("EQUATE")
print(round(accuracy_score(equate["golden_label"], equate["generated_label"]), 3))

AWPNLI
w-F1: 0.9584
RedditNLI
w-F1: 0.7085
StressTest
w-F1: 0.6705
RTE_Quant
w-F1: 0.8457
NewsNLI
w-F1: 0.7265
9036
EQUATE
0.704


In [15]:
# Manual cross-check: presumably the per-dataset counts of correct predictions
# (TODO confirm where each number comes from), summed for comparison with the pooled count.
692+175+4658+137+696

6358

In [6]:
# Number of pooled rows where the prediction equals the gold label.
int((equate["golden_label"] == equate["generated_label"]).sum())

6358

In [5]:
# Balanced accuracy (mean of per-class recall) over the pooled EQUATE predictions.
# balanced_accuracy_score is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
balanced_accuracy_score(equate["golden_label"], equate["generated_label"])



0.7052759676093809