In [1]:
%load_ext autoreload
%autoreload 2

from coeditor.common import *
import os

from coeditor.dataset import TokenizedEditDataset
from coeditor.model import CoeditorModel, EvalArgs, DecodingArgs
from coeditor.encoding import TokenizedEdit, decode_tokens, tokens_to_change, AnalysisBasedEditEncoder
from coeditor.history import Added, Modified
import shutil
import random
from prepare_data import make_or_load_datasets, dataset_from_projects, get_commit_history

# Make the directory returned by proj_root() the process working directory.
# Presumably this lets relative paths used later resolve from the project root — confirm.
os.chdir(proj_root())

In [3]:
# Build a tokenized edit dataset from the repository located at proj_root().
# test_data_name = "medium"
test_data_name = "SPOT"
encoder = AnalysisBasedEditEncoder(extra_ctx_names=("usees", "post-usees"))
test_data = dataset_from_projects(
    [proj_root()],
    encoder,
    [False],  # NOTE(review): one flag per project; its meaning is not visible here — confirm
    max_history_per_repo=20,
)

# Tally the edits by the type of their main change.
print(
    "num addition: ",
    sum(1 for edit in test_data.all_edits() if isinstance(edit.main_change, Added)),
)
print(
    "num modifications: ",
    sum(1 for edit in test_data.all_edits() if isinstance(edit.main_change, Modified)),
)

Getting commit histories: 100%|██████████| 1/1 [00:01<00:00,  1.16s/repo]
Create tokenized edits: 100%|██████████| 1/1 [00:30<00:00, 30.20s/chunk]

num addition:  0
num modifications:  116





In [2]:
# Select the dataset by name; the commented-out line below is the alternative choice.
test_data_name = "medium"
# test_data_name = "SPOT"
encoder = AnalysisBasedEditEncoder(extra_ctx_names=("usees", "post-usees"))
# recreate_data=False: presumably reuses previously prepared data when available — confirm.
datasets = make_or_load_datasets(test_data_name, encoder, recreate_data=False)
# NOTE(review): the variable is named test_data but is bound to the "train" split;
# confirm this is intentional. This assignment also rebinds any earlier test_data.
test_data = datasets["train"]

In [3]:
# Pick the edit with the fewest output tokens (first one wins on ties) and print it.
min_ex = min(
    test_data.all_edits(),
    key=lambda edit: len(edit.output_tks),
)
print(min_ex.show())

path: latexify.codegen.function_codegen_test/test_visit_binop

<add> @pytest.mark.parametrize(
<add>     "code,latex",
<add>     [
<add>         ("x**y", r"x^{y}"),
<add>         ("x * y", r"x y"),
<add>         ("x @ y", r"x y"),
<add>         ("x / y", r"\frac{x}{y}"),
<add>         ("x // y", r"\left\lfloor\frac{x}{y}\right\rfloor"),
<add>         ("x % y", r"x \mathbin{\%} y"),
<add>         ("x + y", r"x + y"),
<add>         ("x - y", r"x - y"),
<add>         ("x << y", r"x \ll y"),
<add>         ("x >> y", r"x \gg y"),
<add>         ("x & y", r"x \mathbin{\&} y"),
<add>         ("x ^ y", r"x \oplus y"),
<add>         ("x | y", R"x \mathbin{|} y"),
<add>         ("(x**y)**z", r"\left( x^{y} \right)^{z}"),
<add>         ("(x * y) * z", r"x y z"),
<add>         ("(x @ y) @ z", r"x y z"),
<add>         ("(x / y) / z", r"\frac{\frac{x}{y}}{z}"),
<add>         (
<add>             "(x // y) // z",
<add>             r"\left\lfloor\frac{\left\lfloor\frac{x}{y}\right\rfloor}{z}\right\rfloo

In [3]:
# Partition the dataset's edits into the categories evaluated separately below.
rep_edits = pfilter(TokenizedEdit.is_repetitive_edit, test_data.all_edits())
small_edits = pfilter(TokenizedEdit.is_small_edit, test_data.all_edits())
# An edit counts as a refactoring edit when its updated_calls is truthy.
refactor_edits = list(filter(lambda edit: edit.updated_calls, test_data.all_edits()))

# Report the size of each category next to the total number of edits.
print("Total edits: ", sum(1 for _ in test_data.all_edits()))
print("Repetitive edits: ", len(rep_edits))
print("Small edits: ", len(small_edits))
print("Refactoring edits: ", len(refactor_edits))

filtering: 100%|██████████| 1395/1395 [00:01<00:00, 914.09it/s] 
filtering: 100%|██████████| 1395/1395 [00:00<00:00, 4036.99it/s]

Total edits:  1395
Repetitive edits:  75
Small edits:  805
Refactoring edits:  182





In [4]:
# Choose the trained model directory; the commented-out line is an alternative model.
# model_dir = get_model_dir() / "coeditor-medium-file"
model_dir = get_model_dir() / "coeditor-medium-analysis-post_usees"
model = CoeditorModel.load_pretrained(model_dir)
# NOTE(review): the device is hard-coded to "cuda:1". The return value of .to() is
# discarded, so this assumes the call moves the model in place — confirm.
model.to("cuda:1")

# Default evaluation/decoding settings; later cells rebind these names.
# NOTE(review): the meaning of EvalArgs' positional argument is not visible here.
eval_args = EvalArgs(4096 * 2)
dec_args = DecodingArgs(num_beams=1)

In [5]:
# Evaluation artifacts for this model/dataset pair go under <model_dir>/evals/<test_data_name>.
eval_dir = model_dir / "evals" / test_data_name
# Cache object constructed over eval_dir (presumably pickle-backed, going by its name — confirm).
eval_cache = PickleCache(eval_dir)

In [16]:
# Call-update accuracy on the refactoring edits with the signature prefix disabled.
# NOTE: this mutates data_args on the shared model, so the setting persists into later cells.
model.data_args.use_signature_prefix = False
refactor_data = TokenizedEditDataset.from_edits(refactor_edits)
# eval_args and dec_args are whatever the most recently executed cell bound them to.
refactor_result = model.predict_on_data(refactor_data, eval_args, dec_args)
# call_correct is kept because a later cell passes it to save_examples_to_dir.
call_acc, call_correct = refactor_result.call_update_accuracy()
print("Call update accuracy: ", call_acc)

decoding: 100%|██████████| 38/38 [02:01<00:00,  3.20s/batch]


20 / 200 calls were considered incorrect since they failed to parse.
Call update accuracy:  (mean=0.31, weight=200)


In [6]:
# Call-update accuracy on the refactoring edits using 6-beam decoding.
# NOTE: this mutates data_args on the shared model and rebinds the notebook-level
# eval_args / dec_args, so all three settings persist into later cells.
model.data_args.use_signature_prefix = False
refactor_data = TokenizedEditDataset.from_edits(refactor_edits)
eval_args = EvalArgs(4096)
dec_args = DecodingArgs(num_beams=6)
refactor_result = model.predict_on_data(refactor_data, eval_args, dec_args)
call_acc, call_correct = refactor_result.call_update_accuracy()
# Derive the label from the flag actually in effect. The previous hard-coded
# "(use_signature_prefix)" label contradicted the False setting above.
prefix_label = "(use_signature_prefix) " if model.data_args.use_signature_prefix else ""
print(f"{prefix_label}Call update accuracy: ", call_acc)

decoding: 100%|██████████| 166/166 [06:28<00:00,  2.34s/batch]


19 / 200 calls were considered incorrect since they failed to parse.
(use_signature_prefix) Call update accuracy:  (mean=0.355, weight=200)


In [14]:
# Call-update accuracy on the refactoring edits with the signature prefix enabled.
# NOTE: this mutates data_args on the shared model, so the setting persists into later cells.
model.data_args.use_signature_prefix = True
refactor_data = TokenizedEditDataset.from_edits(refactor_edits)
# Single assignment; the earlier `eval_args = eval_args = ...` repeated the target by mistake.
eval_args = EvalArgs(4096 * 4)
# dec_args is not set here; it keeps whatever value a previously executed cell bound.
refactor_result = model.predict_on_data(refactor_data, eval_args, dec_args)
call_acc, call_correct = refactor_result.call_update_accuracy()
print("(use_signature_prefix) Call update accuracy: ", call_acc)

decoding: 100%|██████████| 38/38 [02:33<00:00,  4.05s/batch]


18 / 200 calls were considered incorrect since they failed to parse.
(use_signature_prefix) Call update accuracy:  (mean=0.39, weight=200)


In [11]:
# Save the examples from the most recent call-update run and print where they went.
# NOTE(review): the path ends in ".txt" but is passed to save_examples_to_dir, which by
# its name expects a directory — confirm the intended layout.
out_dir = eval_dir / "CallUpdateAccuracy.txt"
# refactor_result and call_correct come from whichever accuracy cell was executed last.
refactor_result.save_examples_to_dir(out_dir, call_correct)
print(out_dir)

saving examples: 100%|██████████| 182/182 [00:01<00:00, 92.27it/s] 

/mnt/nas/jiayi/coeditor/models/trained/coeditor-medium-analysis-post_usees/evals/SPOT/CallUpdateAccuracy.txt





In [9]:
# Exact-match accuracy restricted to the repetitive edits.
rep_data = TokenizedEditDataset.from_edits(rep_edits)
# eval_args and dec_args are whatever the most recently executed cell bound them to.
rep_result = model.predict_on_data(rep_data, eval_args, dec_args)
rep_exact_match = rep_result.exact_match_accuracy()
# Only the first element of the returned value is displayed.
display(rep_exact_match[0])

(mean=0.74667, weight=75)

In [7]:
# Decode all of test_data with a single beam, going through eval_cache.
# NOTE(review): the cache entry name does not encode eval_args/dec_args; if cached()
# returns a stored result when one exists, changed settings would not be reflected — confirm.
dec_args = DecodingArgs(num_beams=1)
dec_result = eval_cache.cached(
    "DatasetDecodingResult.pkl",
    lambda: model.predict_on_data(test_data, eval_args, dec_args),
)