In [30]:
import traceback

import pandas as pd
import os
import random
from dataset_validation import validate_code_quality

In [9]:
# The project root is the parent of the current working directory.
root_path = os.path.split(os.getcwd())[0]
# Source directory of the lila-equate data; later cells address it as <dataset>/<split>.
lila_equate_source_path = os.path.join(root_path, "data", "lila-equate")

### Randomly sample 100 examples from each dataset

In [18]:
# NOTE: one-off sampling step, kept commented out.
# For each dataset it would read train.csv, draw 100 distinct `sample_index` values at random
# and write the first matching row's `lila_script` to <dataset>/train/sample_<idx>.py.
# `sampled_indices` is created here, so this cell does not define it while the code stays
# commented out, although later cells look up `sampled_indices[dataset]`.
# As written the step cannot be re-run over existing output: `os.makedirs` is called without
# `exist_ok` and the files are opened in exclusive-create mode ('x').
# The draw is unseeded (no `random.seed` call in the visible part of this notebook), so a
# re-run would pick a different sample.
# ---
# datasets = ["AWPNLI", "NewsNLI", "RTE_Quant", "StressTest"]
# sampled_indices = dict()
#
# for dataset in datasets:
#     df = pd.read_csv(os.path.join(lila_equate_source_path, dataset, "train.csv"))
#     os.makedirs(os.path.join(lila_equate_source_path, dataset, "train"))
#     indices = list(df["sample_index"].unique())
#     sample = random.sample(population=indices, k=100)
#     sampled_indices[dataset] = sample
#     for idx in sample:
#         with open(os.path.join(lila_equate_source_path, dataset, "train", f"sample_{idx}.py"), 'x') as f:
#             f.write(df[df["sample_index"]==idx]["lila_script"].iloc[0])

In [20]:
# NOTE: one-off sampling step for the StressTest test split, kept commented out.
# It would draw 100 distinct `sample_index` values from StressTest/test.csv and write the
# first matching row's `lila_script` to StressTest/test/sample_<idx>.py.
# The draw is also stored under `sampled_indices["StressTest_test"]`, so the step needs the
# `sampled_indices` dict to exist already.
# `os.makedirs` without `exist_ok` and open mode 'x' make it fail if the output already exists.
# ---
# st_test = pd.read_csv(os.path.join(lila_equate_source_path, "StressTest", "test.csv"))
# sample = random.sample(population=list(st_test["sample_index"].unique()), k=100)
# sampled_indices["StressTest_test"] = sample
# os.makedirs(os.path.join(lila_equate_source_path, "StressTest", "test"))
# for idx in sample:
#     with open(os.path.join(lila_equate_source_path, "StressTest", "test", f"sample_{idx}.py"), 'x') as f:
#         f.write(st_test[st_test["sample_index"]==idx]["lila_script"].iloc[0])

In [24]:
# Score every sampled Lila script with `validate_code_quality` and save one CSV row per
# script under data/code_quality/lila.
output_path = os.path.join(root_path, "data", "code_quality", "lila")
os.makedirs(output_path, exist_ok=True)

for dataset in ["StressTest"]:
    print(dataset)
    evaluations = []
    scripts_path = os.path.join(lila_equate_source_path, dataset, "test")
    try:
        for file in os.listdir(scripts_path):
            # Only score Python scripts. Any other directory entry (checkpoint or cache
            # folders, for instance) would otherwise break the integer parsing below and
            # abort the whole loop.
            if not file.endswith(".py"):
                continue
            # File names look like `sample_<idx>.py`; the trailing number is the sample index.
            index = int(file.split(".")[0].split("_")[-1])
            scores_dict, resolutions = validate_code_quality(os.path.join(scripts_path, file))
            if scores_dict:
                scores_dict.update({"resolutions": resolutions, "sample_index": index})
                evaluations.append(scores_dict)
            else:
                # Nothing usable came back for this script: report its index instead.
                print(index)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so wrapping it in
        # print() only added a stray "None" line to the output.
        traceback.print_exc()
    finally:
        # Runs after a failure too, so the scores collected before the error are still saved.
        df = pd.DataFrame(evaluations)
        df.to_csv(os.path.join(output_path, f"{dataset}_test.csv"), index=False)

StressTest
0.0 32.0 100.0 100.0
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
0.0 32.0 100.0 100.0
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
0.0 32.0 100.0 100.0
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
0.0 32.0 100.0 100.0
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
0.0 32.0 

In [31]:
# Score the GPT-4 generated scripts that belong to the sampled indices and save one CSV per
# dataset under data/code_quality/ours-gpt4.
output_path = os.path.join(root_path, "data", "code_quality", "ours-gpt4")
os.makedirs(output_path, exist_ok=True)

# `sampled_indices` is only assigned by the commented-out sampling cell, so it does not exist
# after a kernel restart. Reuse it when the kernel still holds it, otherwise start empty and
# rebuild each dataset's entry below.
sampled_indices = globals().get("sampled_indices", {})

for dataset in ["RTE_Quant", "NewsNLI", "AWPNLI"]:
    evaluations = []
    scripts_path = os.path.join(root_path, "data", "generated", dataset, "gpt4")
    try:
        if dataset not in sampled_indices:
            # Recover the sample from the script names the sampling step wrote to
            # <lila-equate>/<dataset>/train/sample_<idx>.py.
            # NOTE(review): assumes that directory still holds exactly the sampled scripts.
            sampled_dir = os.path.join(lila_equate_source_path, dataset, "train")
            sampled_indices[dataset] = [
                int(name.split(".")[0].split("_")[-1])
                for name in os.listdir(sampled_dir)
                if name.endswith(".py")
            ]
        # Set membership keeps the per-file check cheap; the stored list is left untouched.
        wanted = set(sampled_indices[dataset])

        for file in os.listdir(scripts_path):
            if not file.endswith(".py"):
                continue
            # The trailing number of the file name is the sample index.
            index = int(file.split(".")[0].split("_")[-1])
            if index not in wanted:
                continue
            scores_dict, resolutions = validate_code_quality(os.path.join(scripts_path, file))
            if scores_dict:
                scores_dict.update({"resolutions": resolutions, "sample_index": index})
                evaluations.append(scores_dict)
            else:
                # Nothing usable came back for this script: report its index instead.
                print(index)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so wrapping it in
        # print() only added a stray "None" line to the output.
        traceback.print_exc()
    finally:
        # Runs after a failure too, so the scores collected before the error are still saved.
        df = pd.DataFrame(evaluations)
        df.to_csv(os.path.join(output_path, f"{dataset}_train.csv"), index=False)

100.0 64.0 68.75 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 68.0 70.59 50.0
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 76.0 73.68 50.0
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 68.0 70.59 50.0
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 64.0 68.75 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 76.0 73.68 50.0
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 76.0 73.68 50.0
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
100.0 72.0 72.22 50.0
['\x1

### CODE QUALITY RESULTS FOR LILA

In [25]:
# Mean and standard deviation of each code-quality metric for the Lila scripts (train CSVs).
# The results directory is spelled out instead of read from `output_path`: that global is
# reassigned by other cells (e.g. to "ours-gpt4"), so its value depends on which cell ran
# last and would point this summary at the wrong results on a top-to-bottom run.
lila_results_path = os.path.join(root_path, "data", "code_quality", "lila")
metric_columns = (
    ("Readability", "readability"),
    ("Redundancy", "redundancy_check"),
    ("Document size", "document_size"),
    ("Function size", "function_size"),
)

for dataset in ["StressTest", "AWPNLI", "RTE_Quant", "NewsNLI"]:
    quality_df = pd.read_csv(os.path.join(lila_results_path, f"{dataset}_train.csv"))
    print(f"############{dataset}############")
    for label, column in metric_columns:
        scores = quality_df[column]
        print(f"{label}: {scores.mean()} (+-{scores.std()})")
    # Only the first row's resolutions are shown, as an example.
    print(f"Resolutions:\n{quality_df.iloc[0]['resolutions']}")

############StressTest############
Readability: 0.0 (+-0.0)
Redundancy: 85.23599999999999 (+-8.38821010030559)
Document size: 45.04 (+-7.992319545486888)
Function size: 100.0 (+-0.0)
Resolutions:
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
############AWPNLI############
Readability: 0.0 (+-0.0)
Redundancy: 86.0855 (+-10.074809106645821)
Document size: 42.36 (+-6.178359103525887)
Function size: 100.0 (+-0.0)
Resolutions:
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']
############RTE_Quant############
Readability: 0.0 (+-0.0)
Redundancy: 88.48749999999998 (+-9.422987299496093)
Document size: 41.84 (+-5.78175465310375)
Function size: 100.

In [26]:
# Mean and standard deviation of each code-quality metric for the Lila StressTest test scripts.
# The results directory is spelled out instead of read from `output_path`: that global is
# reassigned by other cells (e.g. to "ours-gpt4"), so its value depends on which cell ran last.
lila_results_path = os.path.join(root_path, "data", "code_quality", "lila")
quality_df = pd.read_csv(os.path.join(lila_results_path, "StressTest_test.csv"))
print("############StressTest############")
for label, column in (
    ("Readability", "readability"),
    ("Redundancy", "redundancy_check"),
    ("Document size", "document_size"),
    ("Function size", "function_size"),
):
    scores = quality_df[column]
    print(f"{label}: {scores.mean()} (+-{scores.std()})")
# Only the first row's resolutions are shown, as an example.
print(f"Resolutions:\n{quality_df.iloc[0]['resolutions']}")

############StressTest############
Readability: 0.5556 (+-5.556)
Redundancy: 98.90919999999998 (+-4.339261604550628)
Document size: 33.04 (+-3.041530715102034)
Function size: 100.0 (+-0.0)
Resolutions:
['\x1b[36mAdding comments, giving helpful variable names, consistent casing may help with readability.\x1b[0m', '\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m']


In [32]:
# Per-dataset summary of the scores stored under data/code_quality/ours-gpt4:
# mean and standard deviation of every metric column, then the first row's resolutions.
gpt4_results_dir = os.path.join(root_path, "data", "code_quality", "ours-gpt4")
metric_columns = (
    ("Readability", "readability"),
    ("Redundancy", "redundancy_check"),
    ("Document size", "document_size"),
    ("Function size", "function_size"),
)

for dataset in ["AWPNLI", "RTE_Quant", "NewsNLI"]:
    quality_df = pd.read_csv(os.path.join(gpt4_results_dir, f"{dataset}_train.csv"))
    print(f"############{dataset}############")
    for label, column in metric_columns:
        scores = quality_df[column]
        print(f"{label}: {scores.mean()} (+-{scores.std()})")
    first_row = quality_df.iloc[0]
    print(f"Resolutions:\n{first_row['resolutions']}")

############AWPNLI############
Readability: 100.0 (+-0.0)
Redundancy: 72.4104347826087 (+-0.5027513234996355)
Document size: 72.52173913043478 (+-1.3774008863003637)
Function size: 50.0 (+-0.0)
Resolutions:
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
############RTE_Quant############
Readability: 100.0 (+-0.0)
Redundancy: 72.50220000000002 (+-3.0386651707949044)
Document size: 75.8 (+-11.160898555477358)
Function size: 50.0 (+-0.0)
Resolutions:
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
############NewsNLI############
Readability: 100.0 (+-0.0)
Redundancy: 72.55840000000002 (+-2.8406791477042983)
Document size: 75.44 (+-12.6748936600245)
Function size: 50.5 (+-5.0)
Resolutions:
['\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']


In [39]:
# Score the "cc_no_comments" scripts that belong to the sampled indices and save one CSV per
# dataset under data/code_quality/ours-no_comments. Before scoring, each script is rewritten
# in place so that it starts after its "# Golden Label:" line.
output_path = os.path.join(root_path, "data", "code_quality", "ours-no_comments")
os.makedirs(output_path, exist_ok=True)

# `sampled_indices` is only assigned by the commented-out sampling cell, so it does not exist
# after a kernel restart. Reuse it when the kernel still holds it, otherwise start empty and
# rebuild each dataset's entry below.
sampled_indices = globals().get("sampled_indices", {})

for dataset in ["AWPNLI"]:
    evaluations = []
    scripts_path = os.path.join(root_path, "data", "generated", dataset, "cc_no_comments")
    try:
        if dataset not in sampled_indices:
            # Recover the sample from the script names the sampling step wrote to
            # <lila-equate>/<dataset>/train/sample_<idx>.py.
            # NOTE(review): assumes that directory still holds exactly the sampled scripts.
            sampled_dir = os.path.join(lila_equate_source_path, dataset, "train")
            sampled_indices[dataset] = [
                int(name.split(".")[0].split("_")[-1])
                for name in os.listdir(sampled_dir)
                if name.endswith(".py")
            ]
        wanted = set(sampled_indices[dataset])

        for file in os.listdir(scripts_path):
            if not file.endswith(".py"):
                continue
            # The trailing number of the file name is the sample index.
            index = int(file.split(".")[0].split("_")[-1])
            if index not in wanted:
                continue

            script_path = os.path.join(scripts_path, file)
            with open(script_path, 'r') as f:
                lines = f.readlines()
            # Position of the first "# Golden Label:" line, or None when there is none.
            marker_pos = next(
                (pos for pos, line in enumerate(lines) if line.startswith("# Golden Label:")),
                None,
            )
            # Rewrite only when the marker is present. Slicing unconditionally emptied any
            # script without the marker, which includes every script on a second run of
            # this cell, because the first run has already removed the marker line.
            if marker_pos is not None:
                with open(script_path, 'w') as f:
                    f.writelines(lines[marker_pos + 1:])

            scores_dict, resolutions = validate_code_quality(script_path)
            if scores_dict:
                scores_dict.update({"resolutions": resolutions, "sample_index": index})
                evaluations.append(scores_dict)
            else:
                # Nothing usable came back for this script: report its index instead.
                print(index)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so wrapping it in
        # print() only added a stray "None" line to the output.
        traceback.print_exc()
    finally:
        # Runs after a failure too, so the scores collected before the error are still saved.
        df = pd.DataFrame(evaluations)
        df.to_csv(os.path.join(output_path, f"{dataset}_train.csv"), index=False)

71.43 56.0 71.43 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
71.43 56.0 71.43 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
71.43 56.0 71.43 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
71.43 56.0 71.43 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
71.43 56.0 71.43 50.0
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful

In [40]:
# Per-dataset summary of the scores stored under data/code_quality/ours-no_comments:
# mean and standard deviation of every metric column, then the first row's resolutions.
no_comments_results_dir = os.path.join(root_path, "data", "code_quality", "ours-no_comments")
metric_columns = (
    ("Readability", "readability"),
    ("Redundancy", "redundancy_check"),
    ("Document size", "document_size"),
    ("Function size", "function_size"),
)

for dataset in ["AWPNLI"]:
    quality_df = pd.read_csv(os.path.join(no_comments_results_dir, f"{dataset}_train.csv"))
    print(f"############{dataset}############")
    for label, column in metric_columns:
        scores = quality_df[column]
        print(f"{label}: {scores.mean()} (+-{scores.std()})")
    first_row = quality_df.iloc[0]
    print(f"Resolutions:\n{first_row['resolutions']}")

############AWPNLI############
Readability: 70.80913043478263 (+-1.6391070546974345)
Redundancy: 71.67782608695654 (+-0.6542654209926697)
Document size: 56.52173913043478 (+-1.3774008863003637)
Function size: 50.0 (+-0.0)
Resolutions:
['\x1b[36mExtremely large documents may deter people from reading, very small document may be wasteful.\x1b[0m', '\x1b[36mKeep functions small. Large functions get unmanageable over time.\x1b[0m']
