# Evaluating the Refactory Model on the Dublin Program Repair Dataset 

#### Running this notebook is optional, since we already compiled the data for you.

In [None]:
from datasets import disable_caching

# Switch off the `datasets` cache before any dataset is loaded or transformed below.
disable_caching()

In [None]:
import os
import sys

# Put the two parent directories on the import path so that project-local
# packages (the cells below import from `src`) can be resolved.
sys.path.extend(["../", "../../"])

In [None]:
from src.utils.files import read_config

# Load the experiment configuration; later cells read `ref_config.save_path`
# from it.
ref_config = read_config("../configs/conf.json")
# Display the loaded configuration as the cell output.
ref_config

In [None]:
from datasets import load_dataset, concatenate_datasets


def _load_all_splits_as_frame(config_name):
    """Load every split of a `koutch/intro_prog` configuration as one DataFrame.

    All the splits returned by `load_dataset` are concatenated (in the order
    the splits are listed) and the result is converted with `to_pandas()`.
    """
    splits = load_dataset("koutch/intro_prog", config_name)
    merged = concatenate_datasets(list(splits.values()))
    return merged.to_pandas()


# Annotated (repair) data and the original submissions, both as DataFrames.
annotated_dataset = _load_all_splits_as_frame("dublin_repair")
original_dataset = _load_all_splits_as_frame("dublin_data")

In [None]:
# Inspect the assignment ids found in the annotated (repair) data.
annotated_dataset["assignment_id"]

In [None]:
# Inspect the assignment ids found in the original data, for comparison.
original_dataset["assignment_id"]

In [None]:
import pandas as pd
from src.common import new_assignments_id

# Buggy programs: keep only the submissions that actually carry an annotation.
annotated_dataset = annotated_dataset[annotated_dataset.annotation.astype(bool)]
ann_subids = set(annotated_dataset["submission_id"])

# Reference solutions: correct submissions that are not among the annotated ones.
original_dataset = original_dataset[original_dataset["correct"]]
original_dataset = original_dataset[~original_dataset.submission_id.isin(ann_subids)]

# Only keep the assignments that are present on both sides.
assignments = set(annotated_dataset["assignment_id"]).intersection(original_dataset["assignment_id"])
original_dataset = original_dataset[original_dataset.assignment_id.isin(assignments)]
# `.assign` returns a new frame, so the "correct" column is not written into a
# frame obtained by boolean filtering (which raises SettingWithCopyWarning).
annotated_dataset = (
    annotated_dataset[annotated_dataset.assignment_id.isin(assignments)]
    .assign(correct=False)
)

# Stack buggy programs and reference solutions, then remap the values listed
# in `new_assignments_id`.
dataset = pd.concat([annotated_dataset, original_dataset], axis=0)
dataset = dataset.replace(new_assignments_id)
dataset

Loading the annotations of the buggy code from the online interface. The "train" split is a default split, but it is not meaningful. We also need access to correct solutions, so let's load them from the original data. Moreover, the academic years might have had different assignments. Refactory cannot repair buggy programs if no data about these assignments are available, so we ensure we only keep assignments present across the two splits.

from datasets import load_dataset, concatenate_datasets

# Annotated buggy programs ("train" is just the default split name here).
annotated_dataset = load_dataset("koutch/intro_prog", "dublin_repair")["train"]
# Drop the examples whose annotation is empty/falsy.
annotated_dataset = annotated_dataset.filter(lambda ex: bool(ex["annotation"]))
# Collected before the optional year filter, so that every annotated submission
# is excluded from the reference solutions further down.
ann_subids = set(annotated_dataset["submission_id"])
if ref_config.split_year:
    # Only keep the annotated submissions of academic year 2017.
    annotated_dataset = annotated_dataset.filter(lambda ex: ex["academic_year"] ==2017)
    
hgf_dataset = load_dataset("koutch/intro_prog", "dublin_data")

# We take the correct solutions from the training split (2015-2016)
if ref_config.split_year:
    original_dataset = hgf_dataset["train"]
else:
    # No year split: pool every available split.
    original_dataset = concatenate_datasets(list(hgf_dataset.values()))
    
# Reference solutions = correct submissions that were not annotated.
original_dataset = original_dataset.filter(lambda ex: ex["correct"])
original_dataset = original_dataset.filter(lambda ex: ex["submission_id"] not in ann_subids)

# Keep only the assignments that appear in both datasets.
assignments = set(annotated_dataset["assignment_id"]).intersection(original_dataset["assignment_id"])
filter_f = lambda ex: ex["assignment_id"] in assignments
original_dataset = original_dataset.filter(filter_f)
annotated_dataset = annotated_dataset.filter(filter_f)

# Display both filtered datasets.
original_dataset, annotated_dataset

In [None]:
# Display the annotated dataset.
annotated_dataset

from datasets import Value

def add_correctness(example):
    """Flag one example as incorrect.

    The "correct" field of `example` is set to False in place, and the same
    mapping is returned so the function can be passed to `Dataset.map`.
    """
    example.update(correct=False)
    return example
    
# Mark every annotated (buggy) example as incorrect.
annotated_dataset = annotated_dataset.map(add_correctness)
# Cast `academic_year` to int32 -- presumably to match the schema of
# `original_dataset` before concatenating; TODO confirm.
new_features = annotated_dataset.features.copy()
new_features["academic_year"] = Value("int32")
annotated_dataset = annotated_dataset.cast(new_features)
# Reference solutions first, then the buggy programs.
dataset = concatenate_datasets([original_dataset, annotated_dataset])
dataset

In [None]:
from datasets import Dataset
from src.common import new_assignments_id

# Remap the values listed in `new_assignments_id`, then turn the resulting
# frame into a Hugging Face `Dataset`.
# NOTE(review): the frame keeps the index it had after filtering, which
# `from_pandas` presumably stores as an extra column -- confirm this is wanted.
df = dataset.replace(new_assignments_id)
dataset = Dataset.from_pandas(df)
dataset

In [None]:
from src.refactory import create_save_dir

# Write the dataset under `ref_config.save_path` -- presumably in the folder
# layout the Refactory tool expects; see `src.refactory.create_save_dir`.
create_save_dir(dataset, ref_config.save_path)

In [None]:
from src.utils.files import create_dir

# Sub-folder of the save directory used by the (commented-out) save below.
hgf_save_path = os.path.join(ref_config.save_path, 'hgf')
# Be careful below: running these lines might overwrite what is already stored
# at that location, hence they are left commented out.
# create_dir(hgf_save_path)
# dataset.save_to_disk(hgf_save_path)

To run our evaluation using Refactory, we need to provide the location of a folder where the algorithm is going to read files and perform the corrections. In order to run the code, one should download the [refactory tool online](https://github.com/githubhuyang/refactory) according to their instructions. Then, execute the repair tool by passing the path to the temporary directory where the data was formatted.


```
python3 run.py -d ./[PATH_TO_TEMP] -q question_1 question_2 question_3 question_4 ... question_10 -s 100 -o -m -b
```

To look at the results, see the notebook `refactory_analysis.ipynb`.