In [786]:
import os
import traceback

import numpy as np
import pandas as pd
import re
from rouge_score import rouge_scorer
from typing import Optional
from sklearn.metrics import classification_report

In [787]:
results_overview = dict()

In [788]:
root_path = os.path.dirname(os.path.dirname(os.getcwd()))

In [789]:
job_id = "5508387_finetuned"
model_name = "llama-2-chat"
model_size = "7B"
results_path = os.path.join(root_path, "results_inference", model_name, model_size, job_id)
test_set = pd.read_csv(os.path.join(results_path, "test.csv"))
print(test_set.shape[0])
print(test_set.columns)

1738
Index(['sample_index', 'completion', 'prompt', 'source', 'input_prompt',
       'inferred_completion'],
      dtype='object')


In [790]:
def has_python_script(text: str) -> bool:
    """There is a complete or incomplete script in the LLM completion"""
    return (not pd.isna(text)) and ("```python" in text or "```" in text)

def has_non_python_script(completion: str):
    """CodeLlama gives Golang scripts, detect those by some common keywords"""
    return (not pd.isna(completion)) and ("package main" in completion or "Println" in completion or ":=" in completion)

def extract_script(text: str) -> Optional[str | None]:
    """Extract Python scripts from the completion"""
    if pd.isna(text):
        return np.nan
    code_extraction_regex, code_extraction_regex_backup = r"```python([^`]+)```", r"```([^`]+)```"
    scripts = re.findall(code_extraction_regex, text)
    if not scripts:
        # rare case: max_tokens reached before "```" indicator is completely returned by LLM
        scripts = re.findall(code_extraction_regex_backup, text)
        if not scripts:
            return np.nan
    return scripts[0].replace("`", "").replace("python", "")  # "\n".join(scripts)

### Any empty completions?

In [791]:
empty_completions = test_set[(test_set["inferred_completion"].isna()) | (test_set["inferred_completion"].isin(["", " "]))].shape[0]
results_overview["empty_completions"] = empty_completions
empty_completions

0

### How many completions with Python scripts?

In [792]:
test_set["has_script"] = test_set["inferred_completion"].apply(lambda completion: has_python_script(completion))

python_scripts = test_set[test_set["has_script"] == True].shape[0]
results_overview["python_scripts"] = python_scripts
python_scripts

1738

### How many completions with Golang scripts?

In [793]:
test_set["has_non_python_script"] = test_set["inferred_completion"].apply(lambda completion: has_non_python_script(completion))

non_python_scripts = test_set[test_set["has_non_python_script"] == True].shape[0]
results_overview["non_python_scripts"] = non_python_scripts
non_python_scripts

0

### Check there cannot be both Python and non-Python scripts in one completion

In [794]:
test_set[(test_set["has_script"]==True) & (test_set["has_non_python_script"]==True)].shape[0]

0

#### Per dataset, what is the split between generated (complete or not) scripts and missing scripts?

In [795]:
test_set.groupby(["source", "has_script"]).count()["sample_index"]

source      has_script
awpnli      True           180
newsnli     True           242
redditnli   True            62
rtequant    True            41
stresstest  True          1213
Name: sample_index, dtype: int64

#### Inspect some of the identified Python scripts

In [796]:
test_set["inferred_script"] = test_set["inferred_completion"].apply(lambda completion: extract_script(completion))

In [797]:
test_set[test_set["has_script"] == True][["completion", "inferred_completion", "inferred_script"]].head(5)

Unnamed: 0,completion,inferred_completion,inferred_script
0,```python\nmax_gloves_premise = 60\ngloves_hyp...,\n```python\nmax_gloves_premise = 60\ngloves_h...,\nmax_gloves_premise = 60\ngloves_hypothesis =...
1,```python\nstretch_time_premise = 10\nmax_stre...,\n```python\nstop_time_premise = 10\nmax_stop_...,\nstop_time_premise = 10\nmax_stop_time_hypoth...
2,```python\nincome_spend_on_petrol_premise = 0....,```python\npetrol_expense_premise = 30\npetrol...,\npetrol_expense_premise = 30\npetrol_expense_...
3,```python\nmin_boys_premise = 2\nboys_hypothes...,\n```python\nmin_boys_premise = 2\nboys_hypoth...,\nmin_boys_premise = 2\nboys_hypothesis = 6\ng...
4,```python\nbutter_kg_premise = 27\nbutter_kg_h...,\n```python\nbutter_weight_premise = 27\nbutte...,\nbutter_weight_premise = 27\nbutter_weight_hy...


### We notice that some scripts only return a boolean value, but there is no reasoning/math logic to infer it, so these scripts should be skipped during the evaluation

In [798]:
def returns_only_label(script: str):
    """Detect Python scripts that return directly a label, with no reasoning"""
    if pd.isna(script):
        return False
    script = script.replace("\n", "").strip()
    return script == "True" or script == "False"

In [799]:
test_set["only_label"] = test_set["inferred_script"].apply(lambda script: returns_only_label(script))
label_only = test_set[test_set["only_label"] == True].shape[0]
results_overview["label_only"] = label_only
label_only

0

#### For how many samples the `max_new_tokens` limit was exceeded before the full script could not be outputted?

In [800]:
def incomplete_script(has_script: bool, script: Optional[str | None]) -> bool:
    return has_script and pd.isna(script)

test_set["tokens_exceeded"] = test_set.apply(lambda row: incomplete_script(row["has_script"], row["inferred_script"]), axis=1)
tokens_exceeded = test_set[test_set["tokens_exceeded"]==True].shape[0]
results_overview["tokens_exceeded"] = tokens_exceeded
tokens_exceeded

69

In [801]:
test_set["expected_script"] = test_set["completion"].str.replace("```", "")
test_set["expected_script"] = test_set["expected_script"].str.replace("python", "")

### COMPUTE SIMILARITY BETWEEN TARGET AND INFERRED COMPLETION

In [802]:
# Your generated text and the reference text
generated_text = "The weather outside is so good today!"
reference_text = "Today we have such good weather outside!"

#### ROUGE metric

In [803]:
# Create a scorer object
scorer_1 = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
scorer_2 = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
scorer_L = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def rouge1_score(reference_text: str, generated_text: str, compute_score: bool):
    if not (pd.isna(reference_text) or pd.isna(generated_text)) and compute_score:
        return round(scorer_1.score(reference_text, generated_text)['rouge1'].fmeasure, 4)
    return np.nan

def rouge2_score(reference_text: str, generated_text: str, compute_score: bool):
    if not (pd.isna(reference_text) or pd.isna(generated_text)) and compute_score:
        return round(scorer_2.score(reference_text, generated_text)['rouge2'].fmeasure, 4)
    return np.nan

def rougeL_score(reference_text: str, generated_text: str, compute_score: bool):
    if not (pd.isna(reference_text) or pd.isna(generated_text)) and compute_score:
        return round(scorer_L.score(reference_text, generated_text)['rougeL'].fmeasure, 4)
    return np.nan

In [804]:
print(rouge1_score(reference_text, generated_text, True))
print(rouge2_score(reference_text, generated_text, True))
print(rougeL_score(reference_text, generated_text, True))

0.5714
0.1667
0.2857


#### METEOR metric

In [805]:
import nltk
from nltk.translate.meteor_score import single_meteor_score

# Ensure that NLTK's tokenizers and taggers are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

def meteor_score(reference_text: str, generated_text: str, compute_score: bool):
    if not (pd.isna(reference_text) or pd.isna(generated_text)) and compute_score:
        return round(single_meteor_score(reference_text.split(" "), generated_text.split(" ")), 4)
    return np.nan

print(meteor_score(reference_text, generated_text, True))

0.1429


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ioanamazilu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ioanamazilu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [806]:
# import nltk
# from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
# from tree_sitter import Language, Parser
#
# # Load your programming language grammar, assuming Python here
# # Language.build_library(
# #   'build/my-languages.so',
# #   ['python']
# # )
# PY_LANGUAGE = Language('parser/my-languages.so', 'python')
# parser = Parser()
# parser.set_language(PY_LANGUAGE)
#
# def get_tokens(code):
#     """Extract tokens from source code using tree-sitter."""
#     tree = parser.parse(bytes(code, "utf8"))
#     root_node = tree.root_node
#     tokens = [node.text.decode('utf8') for node in root_node.walk() if node.type == 'identifier']
#     return tokens
#
# # Example codes
# generated_code = "def add(a, b):\n    return a + b"
# reference_code = ["def add(num1, num2):\n    return num1 + num2"]
#
# # Tokenize codes
# tokenized_generated_code = get_tokens(generated_code)
# tokenized_reference_code = [get_tokens(ref) for ref in reference_code]
#
# # Compute BLEU score
# chencherry = SmoothingFunction()
# bleu_score = corpus_bleu(tokenized_reference_code, [tokenized_generated_code], smoothing_function=chencherry.method1)
#
# print(f"CodeBLEU (simplified, BLEU part only) score: {bleu_score}")

#### CHRF score

In [807]:
import nltk
from nltk.translate.chrf_score import sentence_chrf

# Ensure that NLTK's tokenizers and data are available
nltk.download('punkt')

def chrf_score(reference_text: str, generated_text: str, compute_score: bool):
    if not (pd.isna(reference_text) or pd.isna(generated_text)) and compute_score:
        return round(sentence_chrf(reference_text, generated_text), 4)
    return np.nan

print(f"CHRF score: {chrf_score(reference_text, generated_text, True)}")

CHRF score: 0.4935


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ioanamazilu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Compare inferred and expected scripts only for valid Python scripts

In [808]:
def evaluate_script(row):
    """Evaluate a script only if it is a complete Python script that does not return a label directly """
    return not row["tokens_exceeded"] and not row["only_label"] and row["has_script"]

test_set["evaluate_script"] = test_set.apply(lambda row: evaluate_script(row), axis=1)
evaluated_scripts = test_set[test_set["evaluate_script"]==True].shape[0]
results_overview["evaluated_scripts"] = evaluated_scripts
evaluated_scripts

1669

In [809]:
test_set["rouge1_score"] = test_set.apply(lambda row: rouge1_score(row["expected_script"], row["inferred_script"], row["evaluate_script"]), axis=1)
test_set["rouge2_score"] = test_set.apply(lambda row: rouge2_score(row["expected_script"], row["inferred_script"], row["evaluate_script"]), axis=1)
test_set["rougeL_score"] = test_set.apply(lambda row: rougeL_score(row["expected_script"], row["inferred_script"], row["evaluate_script"]), axis=1)
test_set["meteor_score"] = test_set.apply(lambda row: meteor_score(row["expected_script"], row["inferred_script"], row["evaluate_script"]), axis=1)
test_set["chrf_score"] = test_set.apply(lambda row: chrf_score(row["expected_script"], row["inferred_script"], row["evaluate_script"]), axis=1)

In [810]:
score_columns = [col for col in test_set.columns if "_score" in col]
test_set[score_columns].aggregate(["mean", "std"])

Unnamed: 0,rouge1_score,rouge2_score,rougeL_score,meteor_score,chrf_score
mean,0.775716,0.598265,0.713113,0.664522,0.74674
std,0.12579,0.184464,0.152376,0.164142,0.139202


## Write the inferred scripts to python files for QNLI classification evaluation & code quality evaluation

In [811]:
test_set["evaluate_script"].value_counts()

evaluate_script
True     1669
False      69
Name: count, dtype: int64

In [812]:
dataset_name_mapping = {
    "stresstest": "StressTest",
    "rtequant": "RTE_Quant",
    "awpnli": "AWPNLI",
    "newsnli": "NewsNLI",
    "redditnli": "RedditNLI"
}

for _, dataset in dataset_name_mapping.items():
    os.makedirs(os.path.join(results_path, "scripts", dataset), exist_ok=True)

def write_script_to_file(sample_index: str, source_dataset: str, script: Optional[str | None], evaluate_script: bool):
    if not evaluate_script:
        return False
    try:
        with open(os.path.join(results_path, "scripts", dataset_name_mapping[source_dataset], f"sample_{sample_index}.py"), 'x') as f:
            f.write(script)
        return True
    except:
        print(traceback.print_exc())
        return False

In [813]:
test_set["script_to_file"] = test_set.apply(lambda row: write_script_to_file(row["sample_index"], row["source"], row["inferred_script"], row["evaluate_script"]), axis=1)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


Traceback (most recent call last):
  File "/var/folders/gz/wcv7hlzn32s2rb3n3jn8cscm0000gn/T/ipykernel_36094/698810159.py", line 16, in write_script_to_file
    with open(os.path.join(results_path, "scripts", dataset_name_mapping[source_dataset], f"sample_{sample_index}.py"), 'x') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ioanamazilu/PycharmProjects/quant_nli/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 286, in _modified_open
    return io_open(file, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileExistsError: [Errno 17] File exists: '/Users/ioanamazilu/PycharmProjects/quant_nli/results_inference/llama-2-chat/7B/5508387_finetuned/scripts/StressTest/sample_2266.py'
Traceback (most recent call last):
  File "/var/folders/gz/wcv7hlzn32s2rb3n3jn8cscm0000gn/T/ipykernel_36094/698810159.py", line 16, in write_script_to_file
    with open(os.path.jo

In [814]:
test_set[test_set["script_to_file"] == True].shape[0]

0

In [815]:
from src.utils import run_script
from src.dataset_validation import validate_code_quality

count = 0
for idx, row in test_set.iterrows():
    print(idx)
    qualified_for_classification = getattr(row, "evaluate_script")
    if qualified_for_classification:
        count += 1
        sample_index, dataset_key = getattr(row, "sample_index"), getattr(row, "source")
        dataset = dataset_name_mapping[dataset_key]
        script_path = os.path.join(results_path, "scripts", dataset, f"sample_{sample_index}.py")
        label, error_message = run_script(script_path)
        test_set.loc[idx, "inferred_label"] = label
        test_set.loc[idx, "error_message"] = error_message

        code_quality_scores, code_quality_resolutions = validate_code_quality(script_path)
        for score_key, score in code_quality_scores.items():
            test_set.loc[idx, score_key] = score
        test_set.loc[idx, "cc_resolutions"] = "; ".join(code_quality_resolutions)

print(f"Evaluated {count} scripts.")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [816]:
labels_obtained = test_set[test_set["inferred_label"].isin(["entailment", "contradiction", "neutral"])].shape[0]
results_overview["labels_obtained"] = labels_obtained
labels_obtained

1657

In [817]:
code_quality_scores = ['readability', 'document_size', 'redundancy_check', 'function_size']

test_set[code_quality_scores].aggregate(["mean", "std"])

Unnamed: 0,readability,document_size,redundancy_check,function_size
mean,93.354769,56.251648,84.865542,50.029958
std,10.680941,10.154865,1.841865,1.223888


## View results overview

In [818]:
results_overview

{'empty_completions': 0,
 'python_scripts': 1738,
 'non_python_scripts': 0,
 'label_only': 0,
 'tokens_exceeded': 69,
 'evaluated_scripts': 1669,
 'labels_obtained': 1657}

## QNLI results evaluation

In [819]:
# merged = pd.DataFrame()
# for dataset_key, dataset in dataset_name_mapping.items():
#     print(dataset)
#     df = pd.read_csv(os.path.join(root_path, "data", "equate_labelled", f"cleaned_{dataset}_gpt4.csv"))
#     print(f"All samples: {df.shape[0]}")
#     df["source"] = dataset_key
#     test_indices = test_set[test_set["source"]==dataset_key]["sample_index"].unique()
#     print(f"Test samples: {len(test_indices)}")
#     test_subset = df[df["sample_index"].isin(test_indices)]
#     if merged.shape[0] == 0:
#         merged = test_subset
#     else:
#         merged = pd.concat([merged, test_subset], ignore_index=True)
#
# merged.shape

In [820]:
# merged.columns

In [821]:
# merged.to_csv(os.path.join(root_path, "data", "finetuning", "test_all_with_labels.csv"), index=False)

In [822]:
gt_data = pd.read_csv(os.path.join(root_path, "data", "finetuning", "test_all_with_labels.csv"))
gt_data.shape

(1738, 9)

In [823]:
qnli_results_df = pd.merge(test_set, gt_data, on=["sample_index", "source"], how="inner")
qnli_results_df.shape[0]

1738

In [824]:
correct_predictions = qnli_results_df[qnli_results_df["golden_label"] == qnli_results_df["inferred_label"]]

print(f"Score computed based on {correct_predictions.shape[0]} samples")
accuracy = correct_predictions.shape[0] / qnli_results_df.shape[0]
print(f"Accuracy: {accuracy}")

Score computed based on 1360 samples
Accuracy: 0.7825086306098964


In [825]:
correct_predictions = qnli_results_df[qnli_results_df["golden_label"] == qnli_results_df["generated_label"]]

print(f"Score computed based on {correct_predictions.shape[0]} samples")
accuracy = correct_predictions.shape[0] / qnli_results_df.shape[0]
print(f"Accuracy: {accuracy}")

Score computed based on 1218 samples
Accuracy: 0.7008055235903338


In [826]:
qnli_results_df["inferred_label"].value_counts()

inferred_label
entailment       650
contradiction    575
neutral          432
nan               69
error             12
Name: count, dtype: int64

In [827]:
qnli_results_df_filtered = qnli_results_df[qnli_results_df["inferred_label"].isin(["entailment", "contradiction", "neutral"])]
qnli_results_df_filtered.shape[0]

1657

In [828]:
from sklearn.metrics import accuracy_score, classification_report

print(f"Accuracy (fine-tuned model): {accuracy_score(qnli_results_df_filtered['golden_label'], qnli_results_df_filtered['inferred_label'])}")

print(f"Accuracy (GPT-generated data): {accuracy_score(qnli_results_df_filtered['golden_label'], qnli_results_df_filtered['generated_label'])}")

Accuracy (fine-tuned model): 0.38181818181818183
Accuracy (GPT-generated data): 0.8181818181818182
Accuracy (fine-tuned model): 0.8207604103802052
Accuracy (GPT-generated data): 0.7103198551599276


In [829]:
print(classification_report(qnli_results_df_filtered['golden_label'], qnli_results_df_filtered['inferred_label']))

               precision    recall  f1-score   support

contradiction       0.27      0.21      0.24        14
   entailment       0.45      0.54      0.49        24
      neutral       0.33      0.29      0.31        17

     accuracy                           0.38        55
    macro avg       0.35      0.35      0.35        55
 weighted avg       0.37      0.38      0.37        55

               precision    recall  f1-score   support

contradiction       0.83      0.90      0.86       533
   entailment       0.79      0.83      0.81       619
      neutral       0.85      0.73      0.79       505

     accuracy                           0.82      1657
    macro avg       0.82      0.82      0.82      1657
 weighted avg       0.82      0.82      0.82      1657



In [830]:
print(classification_report(qnli_results_df_filtered['golden_label'], qnli_results_df_filtered['generated_label']))

               precision    recall  f1-score   support

contradiction       1.00      0.79      0.88        14
   entailment       0.83      0.83      0.83        24
      neutral       0.70      0.82      0.76        17

     accuracy                           0.82        55
    macro avg       0.84      0.81      0.82        55
 weighted avg       0.83      0.82      0.82        55

               precision    recall  f1-score   support

contradiction       0.88      0.71      0.79       533
   entailment       0.66      0.83      0.74       619
        error       0.00      0.00      0.00         0
      neutral       0.64      0.56      0.60       505

     accuracy                           0.71      1657
    macro avg       0.54      0.53      0.53      1657
 weighted avg       0.72      0.71      0.71      1657



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [831]:
for dataset_key, dataset in dataset_name_mapping.items():
    print(dataset)
    filtered_df = qnli_results_df_filtered[qnli_results_df_filtered["source"] == dataset_key]

    print(f"Accuracy (fine-tuned model): {accuracy_score(filtered_df['golden_label'], filtered_df['inferred_label'])}")

    print(f"Accuracy (GPT-4 data): {accuracy_score(filtered_df['golden_label'], filtered_df['generated_label'])}")

StressTest
Accuracy (fine-tuned model): 0.25
Accuracy (GPT-4 data): 0.7916666666666666
RTE_Quant
Accuracy (fine-tuned model): 0.4
Accuracy (GPT-4 data): 1.0
AWPNLI
Accuracy (fine-tuned model): 0.3333333333333333
Accuracy (GPT-4 data): 1.0
NewsNLI
Accuracy (fine-tuned model): 0.6428571428571429
Accuracy (GPT-4 data): 0.6428571428571429
RedditNLI
Accuracy (fine-tuned model): 0.0
Accuracy (GPT-4 data): 1.0
StressTest
Accuracy (fine-tuned model): 0.8625550660792951
Accuracy (GPT-4 data): 0.677533039647577
RTE_Quant
Accuracy (fine-tuned model): 0.7317073170731707
Accuracy (GPT-4 data): 0.926829268292683
AWPNLI
Accuracy (fine-tuned model): 0.8944444444444445
Accuracy (GPT-4 data): 0.9277777777777778
NewsNLI
Accuracy (fine-tuned model): 0.6443514644351465
Accuracy (GPT-4 data): 0.6736401673640168
RedditNLI
Accuracy (fine-tuned model): 0.5806451612903226
Accuracy (GPT-4 data): 0.6774193548387096


In [832]:
for dataset_key, dataset in dataset_name_mapping.items():
    print(dataset)
    filtered_df = qnli_results_df_filtered[qnli_results_df_filtered["source"] == dataset_key]
    print("FINE-TUNED MODEL")
    print(classification_report(filtered_df['golden_label'], filtered_df['inferred_label']))
    print("GPT4")
    print(classification_report(filtered_df['golden_label'], filtered_df['generated_label']))

StressTest
FINE-TUNED MODEL
               precision    recall  f1-score   support

contradiction       0.33      0.27      0.30        11
   entailment       0.29      0.25      0.27         8
      neutral       0.12      0.20      0.15         5

     accuracy                           0.25        24
    macro avg       0.25      0.24      0.24        24
 weighted avg       0.27      0.25      0.26        24

GPT4
               precision    recall  f1-score   support

contradiction       1.00      0.73      0.84        11
   entailment       0.78      0.88      0.82         8
      neutral       0.57      0.80      0.67         5

     accuracy                           0.79        24
    macro avg       0.78      0.80      0.78        24
 weighted avg       0.84      0.79      0.80        24

RTE_Quant
FINE-TUNED MODEL
              precision    recall  f1-score   support

  entailment       0.17      0.50      0.25         2
     neutral       0.75      0.38      0.50         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


StressTest
FINE-TUNED MODEL
               precision    recall  f1-score   support

contradiction       0.86      0.88      0.87       438
   entailment       0.81      0.90      0.85       355
      neutral       0.95      0.80      0.87       342

     accuracy                           0.86      1135
    macro avg       0.87      0.86      0.86      1135
 weighted avg       0.87      0.86      0.86      1135

GPT4
               precision    recall  f1-score   support

contradiction       0.87      0.67      0.76       438
   entailment       0.60      0.83      0.69       355
        error       0.00      0.00      0.00         0
      neutral       0.60      0.54      0.56       342

     accuracy                           0.68      1135
    macro avg       0.52      0.51      0.50      1135
 weighted avg       0.70      0.68      0.68      1135

RTE_Quant
FINE-TUNED MODEL
              precision    recall  f1-score   support

  entailment       0.65      0.76      0.70        17


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [833]:
dataset_function_name_map = {
    "stresstest": "entailment_or_contradiction_or_neutral(",
    "redditnli": "entailment_or_contradiction_or_neutral(",
    "awpnli": "entailment_or_contradiction(",
    "newsnli": "entailment_or_neutral(",
    "rtequant": "entailment_or_neutral("
}

def correct_function_name(script: str, dataset: str):
    return (not pd.isna(script)) and dataset_function_name_map[dataset] in script

test_set["correct_function_name"] = test_set.apply(lambda row: correct_function_name(row["inferred_script"], row["source"]), axis=1)

scripts_with_correct_function_name = test_set[test_set["correct_function_name"]==True].shape[0]
results_overview["correct_function_name"] = scripts_with_correct_function_name
scripts_with_correct_function_name

75

1668

In [834]:
def script_length_in_lines(script: str):
    if pd.isna(script):
        return 0
    return len(script.split("\n"))

test_set["expected_script_length"] = test_set["expected_script"].apply(lambda script: script_length_in_lines(script))
test_set["inferred_script_length"] = test_set["inferred_script"].apply(lambda script: script_length_in_lines(script))

test_set[test_set["expected_script_length"] == test_set["inferred_script_length"]].shape[0]

13

747

In [835]:
test_set[test_set["expected_script_length"] != test_set["inferred_script_length"]].head(5)

Unnamed: 0,sample_index,completion,prompt,source,input_prompt,inferred_completion,has_script,has_non_python_script,inferred_script,only_label,...,inferred_label,error_message,readability,document_size,redundancy_check,function_size,cc_resolutions,correct_function_name,expected_script_length,inferred_script_length
0,2266,```python\nmax_gloves_premise = 60\ngloves_hyp...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,# The [/INST] tag indicates the end of the in...,True,False,\nTrue\n,True,...,n,n,,,,,,False,14,3
1,7503,```python\nstretch_time_premise = 10\nmax_stre...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,#1\n\n### Output:\nTrue\n\n### Input:\nPremis...,False,False,,False,...,n,n,,,,,,False,15,0
2,7443,```python\nincome_spend_on_petrol_premise = 0....,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,\n\n### Output:\n\n```\nTrue\n```\n\n### Answe...,True,False,\n#[SOLUTION]\n\n#[/SOLUTION]\n,False,...,,,100.0,16.0,75.0,100.0,[36mExtremely large documents may deter peopl...,False,21,5
3,7552,```python\nmin_boys_premise = 2\nboys_hypothes...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,#1\n\n### Output:\nTrue\n\n### Input:\nPremis...,False,False,,False,...,n,n,,,,,,False,17,0
4,7412,```python\nbutter_kg_premise = 27\nbutter_kg_h...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,# [INST]\n\n### Output:\nTrue\n\n### Input:\n...,False,False,,False,...,n,n,,,,,,False,14,0


Unnamed: 0,sample_index,completion,prompt,source,input_prompt,inferred_completion,has_script,has_non_python_script,inferred_script,only_label,...,inferred_label,error_message,readability,document_size,redundancy_check,function_size,cc_resolutions,correct_function_name,expected_script_length,inferred_script_length
2,7443,```python\nincome_spend_on_petrol_premise = 0....,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,```python\npetrol_expense_premise = 30\npetrol...,True,False,\npetrol_expense_premise = 30\npetrol_expense_...,False,...,entailment,,100.0,56.0,85.71,50.0,[36mExtremely large documents may deter peopl...,True,21,15
3,7552,```python\nmin_boys_premise = 2\nboys_hypothes...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,\n```python\nmin_boys_premise = 2\nboys_hypoth...,True,False,\nmin_boys_premise = 2\nboys_hypothesis = 6\ng...,False,...,neutral,,100.0,72.0,83.33,50.0,[36mKeep functions small. Large functions get...,True,17,19
8,6428,```python\nsold_price_premise = 450\nsold_pric...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,\n```python\nsold_price_premise = 450\nsold_pr...,True,False,\nsold_price_premise = 450\nsold_price_hypothe...,False,...,contradiction,,100.0,56.0,85.71,50.0,[36mExtremely large documents may deter peopl...,True,18,15
10,56,```python\nbruce_speed_premise = 30\nbhishma_s...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,```python\nspeed_bruce_premise = 30\nspeed_b...,True,False,\nspeed_bruce_premise = 30\nspeed_bhishma_prem...,False,...,contradiction,,93.75,64.0,87.5,50.0,[36mExtremely large documents may deter peopl...,True,16,17
12,5927,```python\npurses_faiza_premise = 14\npurses_g...,### Instruction:\nYou need to reason about wea...,stresstest,<s>[INST]\n<<SYS>>\nGive a response suitable t...,\n```python\npurses_faiza_premise = 14\ngifted...,True,False,\npurses_faiza_premise = 14\ngifted_purses_pre...,False,...,contradiction,,93.75,64.0,87.5,50.0,[36mExtremely large documents may deter peopl...,True,18,17


## PREDICT MAJORITY CLASS (entailment, for all datasets) FOR SAMPLES WITH NO LABEL AND RECOMPUTE METRICS

In [836]:
qnli_results_df["inferred_label"].value_counts()

inferred_label
n                1457
                  206
entailment         29
error              20
neutral            15
contradiction      11
Name: count, dtype: int64

inferred_label
entailment       650
contradiction    575
neutral          432
nan               69
error             12
Name: count, dtype: int64

In [837]:
update_label_value_to_majority_label = ~qnli_results_df["inferred_label"].isin(["entailment", "contradiction", "neutral"])
qnli_results_df.loc[update_label_value_to_majority_label, "inferred_label"] = "entailment"

qnli_results_df["inferred_label"].value_counts()

inferred_label
entailment       1712
neutral            15
contradiction      11
Name: count, dtype: int64

inferred_label
entailment       731
contradiction    575
neutral          432
Name: count, dtype: int64

In [838]:
qnli_results_df["generated_label"].value_counts()

generated_label
entailment       813
neutral          461
contradiction    459
error              5
Name: count, dtype: int64

generated_label
entailment       813
neutral          461
contradiction    459
error              5
Name: count, dtype: int64

In [839]:
update_label_value_to_majority_label = qnli_results_df["generated_label"].isin(["nan", "error"])
qnli_results_df.loc[update_label_value_to_majority_label, "generated_label"] = "entailment"
qnli_results_df["generated_label"].value_counts()

generated_label
entailment       818
neutral          461
contradiction    459
Name: count, dtype: int64

generated_label
entailment       818
neutral          461
contradiction    459
Name: count, dtype: int64

In [845]:
from sklearn.metrics import accuracy_score, classification_report

print(f"Accuracy (fine-tuned model): {accuracy_score(qnli_results_df['golden_label'], qnli_results_df['inferred_label'])}")

print(f"Accuracy (GPT-generated data): {accuracy_score(qnli_results_df['golden_label'], qnli_results_df['generated_label'])}")

Accuracy (fine-tuned model): 0.7986191024165707
Accuracy (GPT-generated data): 0.7025316455696202


In [846]:
print(classification_report(qnli_results_df['golden_label'], qnli_results_df['inferred_label']))

               precision    recall  f1-score   support

contradiction       0.83      0.86      0.84       560
   entailment       0.74      0.83      0.78       647
      neutral       0.85      0.69      0.77       531

     accuracy                           0.80      1738
    macro avg       0.81      0.79      0.80      1738
 weighted avg       0.80      0.80      0.80      1738



In [847]:
print(classification_report(qnli_results_df['golden_label'], qnli_results_df['generated_label']))

               precision    recall  f1-score   support

contradiction       0.86      0.70      0.77       560
   entailment       0.66      0.83      0.73       647
      neutral       0.63      0.55      0.59       531

     accuracy                           0.70      1738
    macro avg       0.71      0.69      0.70      1738
 weighted avg       0.71      0.70      0.70      1738



In [848]:
for dataset_key, dataset in dataset_name_mapping.items():
    print(dataset)
    filtered_df = qnli_results_df[qnli_results_df["source"] == dataset_key]

    print(f"Accuracy (fine-tuned model): {accuracy_score(filtered_df['golden_label'], filtered_df['inferred_label'])}")

    print(f"Accuracy (GPT-4 data): {accuracy_score(filtered_df['golden_label'], filtered_df['generated_label'])}")

StressTest
Accuracy (fine-tuned model): 0.8293487221764221
Accuracy (GPT-4 data): 0.6669414674361088
RTE_Quant
Accuracy (fine-tuned model): 0.7317073170731707
Accuracy (GPT-4 data): 0.926829268292683
AWPNLI
Accuracy (fine-tuned model): 0.8944444444444445
Accuracy (GPT-4 data): 0.9388888888888889
NewsNLI
Accuracy (fine-tuned model): 0.640495867768595
Accuracy (GPT-4 data): 0.6735537190082644
RedditNLI
Accuracy (fine-tuned model): 0.5806451612903226
Accuracy (GPT-4 data): 0.6774193548387096


In [844]:
for dataset_key, dataset in dataset_name_mapping.items():
    print(dataset)
    filtered_df = qnli_results_df[qnli_results_df["source"] == dataset_key]
    print("FINE-TUNED MODEL")
    print(classification_report(filtered_df['golden_label'], filtered_df['inferred_label']))
    print("GPT4")
    print(classification_report(filtered_df['golden_label'], filtered_df['generated_label']))

StressTest
FINE-TUNED MODEL
               precision    recall  f1-score   support

contradiction       0.33      0.01      0.01       465
   entailment       0.31      0.98      0.48       382
      neutral       0.12      0.00      0.01       366

     accuracy                           0.31      1213
    macro avg       0.26      0.33      0.16      1213
 weighted avg       0.26      0.31      0.16      1213

GPT4
               precision    recall  f1-score   support

contradiction       0.84      0.66      0.74       465
   entailment       0.60      0.82      0.69       382
      neutral       0.59      0.52      0.55       366

     accuracy                           0.67      1213
    macro avg       0.67      0.67      0.66      1213
 weighted avg       0.69      0.67      0.67      1213

RTE_Quant
FINE-TUNED MODEL
              precision    recall  f1-score   support

  entailment       0.43      0.94      0.59        17
     neutral       0.75      0.12      0.21        24



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


StressTest
FINE-TUNED MODEL
               precision    recall  f1-score   support

contradiction       0.86      0.83      0.84       465
   entailment       0.73      0.91      0.81       382
      neutral       0.95      0.75      0.84       366

     accuracy                           0.83      1213
    macro avg       0.84      0.83      0.83      1213
 weighted avg       0.84      0.83      0.83      1213

GPT4
               precision    recall  f1-score   support

contradiction       0.84      0.66      0.74       465
   entailment       0.60      0.82      0.69       382
      neutral       0.59      0.52      0.55       366

     accuracy                           0.67      1213
    macro avg       0.67      0.67      0.66      1213
 weighted avg       0.69      0.67      0.67      1213

RTE_Quant
FINE-TUNED MODEL
              precision    recall  f1-score   support

  entailment       0.65      0.76      0.70        17
     neutral       0.81      0.71      0.76        24

