In [1]:
%load_ext autoreload
%autoreload 2

from coeditor.common import *
import os

# Make the directory returned by `proj_root()` the current working directory.
os.chdir(proj_root())

This notebook demonstrates how to extract code completion problems from real code changes. Let's start by picking 4 commits from the history of this project, starting from a fixed commit ID so that the results are reproducible.

In [2]:
from coeditor.git import get_commit_history, CommitInfo

repo_root = proj_root()

# The history is requested from a pinned commit so that every run of the
# notebook works with the same commits.
pinned_commit_id = "84cfd5206348ecc3f54d202b830f803d8a03f26f"
n_commits = 4
commits = get_commit_history(repo_root, n_commits, commit_id=pinned_commit_id)

# Display the first three entries of the returned history, one per line.
for c in commits[:3]:
    print(c)


CommitInfo(hash='84cfd5206348ecc3f54d202b830f803d8a03f26f', parents=('a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6',), msg='Add ablation: dense attention.')
CommitInfo(hash='a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6', parents=('ecdbdc3875e47887ff3c0320fd1367af28d0a491',), msg='Implement ablation: current_code_only.')
CommitInfo(hash='ecdbdc3875e47887ff3c0320fd1367af28d0a491', parents=('ad918b35e2b8314f30a7f8ffc1e957c9f49956df',), msg='Exclude builtins defs in ctx by default.')


We can then extract `FIMProblem` instances from these commits using the `edits_from_commit_history` function, specifying `C3CompletionGenerator` as the `change_processor`.

In [3]:
from coeditor.scoped_changes import edits_from_commit_history
from coeditor.experiments.code_completion import FIMProblem, C3CompletionGenerator

# Working directory handed to `edits_from_commit_history`; the relative path
# resolves to a sibling of the project root.
workdir = proj_root() / "../temp-1"

# The completion generator acts as the change processor for the extraction.
generator = C3CompletionGenerator()
fim_problems = edits_from_commit_history(
    repo_root,
    commits,
    workdir,
    change_processor=generator,
)
print(f"{len(fim_problems) = }")


building initial project: 100%|██████████| 34/34 [00:01<00:00, 21.39it/s]
processing commits: 100%|██████████| 3/3 [00:07<00:00,  2.41s/it]

len(fim_problems) = 5





We can now visualize an example problem by first converting it into the input-output format used by CodeT5. Each problem instance asks the model to predict a missing line, which corresponds to the last line added by the actual changes made to a given code region. Note that in this format, any previous changes made by the user are directly applied to the code that surrounds the missing line.

Feel free to change `ex_id` to see other examples.

In [4]:
from coeditor.encoding import decode_tokens, _Tokenizer


ex_id = 0
selected_fim_problem = fim_problems[ex_id]
left, right = selected_fim_problem.get_contexts(_Tokenizer)
middle = selected_fim_problem.middle

# The contexts and the missing line are handed to `print_sections` as-is;
# no `decode_tokens` call is made in this cell.
print_sections(("middle", middle), ("left", left), ("right", right))


--------------------------------------------------------------------------------
middle:
                seg = seg + origin_line + [Newline_id]
--------------------------------------------------------------------------------
left:
,
) -> tuple[TokenSeq, ...]:
    """Truncate a list of token sequences to fit within a total length limit.
    Earlier sections have priority over later sections.
    """

    # first, reserve equal space to each section
    section_lens = [total_limit // len(sections) for _ in sections]
    remaining = total_limit
    for i, (tks, _) in enumerate(sections):
        l = min(len(tks), section_lens[i])
        remaining -= l
        section_lens[i] = l
    assert remaining >= 0

    # for the unused space, assign to ealier sections when possible
    for i, (tks, _) in enumerate(sections):
        if remaining <= 0:
            break
        inc = min(remaining, len(tks) - section_lens[i])
        section_lens[i] += inc
        remaining -= inc

    return tuple

Now let's load the CodeT5 and InCoder models and see how they perform on these problems.

In [5]:
from coeditor.experiments.code_completion import CodeT5Wrapper
# Load the `Salesforce/codet5-large` checkpoint through the project wrapper.
# (Alternative kept from the original notebook: `CodeT5Wrapper.from_pretrained()`
# called with no argument.)
codet5 = CodeT5Wrapper.from_pretrained("Salesforce/codet5-large")

# Convert the wrapped model with `.half()` and move it to the "cuda" device;
# the trailing semicolon keeps the notebook from echoing the returned object.
codet5.model.half().to("cuda");

In [6]:
# Ask CodeT5 to fill the gap between the two contexts, then show the
# prediction next to the expected line.
codet5_prediction = codet5.infill(left, right)
print_sections(("predicted", codet5_prediction), ("label", middle))

--------------------------------------------------------------------------------
predicted:
                return
--------------------------------------------------------------------------------
label:
                seg = seg + origin_line + [Newline_id]


In [7]:
from coeditor.experiments.in_coder import InCoderWrapper

# Load the `facebook/incoder-1B` checkpoint with `half_precision=True`.
incoder = InCoderWrapper.from_pretrained(
    "facebook/incoder-1B",
    half_precision=True,
)

# Move the wrapped model to the "cuda" device; the trailing semicolon keeps
# the notebook from echoing the returned object.
incoder.model.to("cuda");

In [8]:
# Build the contexts again with InCoder's own tokenizer. They are stored under
# InCoder-specific names so that `left`/`right` (the contexts consumed by the
# CodeT5 cell) are not overwritten; re-running the CodeT5 cell after this one
# therefore still uses the inputs it was originally given.
incoder_left, incoder_right = fim_problems[ex_id].get_contexts(incoder.tokenizer)

# Show InCoder's prediction next to the expected line.
print_sections(
    ("predicted", incoder.infill(incoder_left, incoder_right)),
    ("label", middle),
)

Token indices sequence length is longer than the specified maximum sequence length for this model (3943 > 2048). Running this sequence through the model will result in indexing errors


--------------------------------------------------------------------------------
predicted:
                indent = " " * 4
                label = cls.show_label(id_map.get(k, -1))
                lines.append(f"{label}:{indent(origin_line, ' ' * 4).lstrip()}")
            else:
                # show the added line
                section_lines = tk_splitlines(main_tk_lines.get(k, TokenSeq()))
                if section_lines:
                    origin_line = section_lines[-1]
                else:
                    origin_line = cls.BAD_DELETE
                indent = " " * 4
--------------------------------------------------------------------------------
label:
                seg = seg + origin_line + [Newline_id]


Now let's compare this with a different format that models this problem as a special case of code editing. To do that, we will run `edits_from_commit_history` again, but with `C3ProblemGenerator` as the `change_processor`. This will give us `C3Problem` instances, which correspond to the general contextual code change prediction problem. We can then convert them into instances that are similar to the `FIMProblem` instances above using `C3ToCodeCompletion`.

In [9]:
from coeditor.c3problem import C3ProblemGenerator, C3ToCodeCompletion

# Same commits and working directory as before, but processed with the
# general C3 problem generator.
c3_generator = C3ProblemGenerator()
c3_problems = edits_from_commit_history(
    repo_root, commits, workdir, change_processor=c3_generator
)
print(f"{len(c3_problems) = }")

# Transform every C3 problem and combine the per-problem results with
# `join_list` (presumably a flattening helper; verify against its definition).
transform = C3ToCodeCompletion()
per_problem_results = list(map(transform.transform, c3_problems))
comp_probs = join_list(per_problem_results)
print(f"{len(comp_probs) = }")


building initial project: 100%|██████████| 34/34 [00:00<00:00, 107.66it/s]
processing commits: 100%|██████████| 3/3 [00:17<00:00,  5.85s/it]

len(c3_problems) = 6
len(comp_probs) = 5





In [10]:
from coeditor.c3problem import C3ProblemTokenizer


# Tokenize the completion problem selected by `ex_id` and print its rendering.
tknizer = C3ProblemTokenizer(max_ref_tks_sum=2000)
selected_comp_prob = comp_probs[ex_id]
tk_prob = tknizer.tokenize_problem(selected_comp_prob)
rendered_problem = tk_prob.show()
print(rendered_problem)


--------------------------------------------------------------------------------
path: coeditor.encoding/TokenizedEdit.show_predictions
n_references: 1
total_reference_tks: 500
project: temp-1
commit: CommitInfo(hash='a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6', parents=('ecdbdc3875e47887ff3c0320fd1367af28d0a491',), msg='Implement ablation: current_code_only.')
 <0>:<add>                 seg = seg + origin_line + [Newline_id]

      # module: coeditor.encoding
      class TokenizedEdit(ABC):
          @classmethod
          def show_predictions(
              cls, pred: TokenSeq, main_tk_lines: dict[Token, TokenSeq]
          ) -> str:
              id_map = {k: i for i, k in enumerate(main_tk_lines)}
              segs = output_ids_as_seqs(pred)
              lines = []
              for k, seg in segs.items():
                  if not seg:
                      continue  # skip empty lines
                  if seg[-1] == Del_id:
                      # show the deleted line
           

In [11]:
from coeditor.model import RetrievalEditorModel
from coeditor.experiments.code_completion import infill_with_coeditor

# Load the `MrVPlusOne/coeditor-perm2k-base-v1.7.3` model.
coeditor = RetrievalEditorModel.load("MrVPlusOne/coeditor-perm2k-base-v1.7.3")

# Move it to the "cuda" device; the trailing semicolon keeps the notebook from
# echoing the returned object.
coeditor.to("cuda");

In [12]:
# Run Coeditor on the tokenized problem, decode the predicted tokens, and show
# the result next to the expected line.
coeditor_pred_tks = infill_with_coeditor(coeditor, tk_prob)
coeditor_pred_text = decode_tokens(coeditor_pred_tks)
print_sections(("predicted", coeditor_pred_text), ("label", middle))

--------------------------------------------------------------------------------
predicted:
<pad><s><extra_id_0> <add>                 seg = seg + origin_line + [Newline_id]
</s>
--------------------------------------------------------------------------------
label:
                seg = seg + origin_line + [Newline_id]


In [13]:
from coeditor.encoding import inline_output_tokens


# Merge the model's output tokens into the main token sequence of the problem
# and print the decoded result.
output_tks = infill_with_coeditor(coeditor, tk_prob)
inlined_tks = inline_output_tokens(tk_prob.main_tks, output_tks)
inlined_text = decode_tokens(inlined_tks)
print(inlined_text)

# module: coeditor.encoding
class TokenizedEdit(ABC):
    @classmethod
    def show_predictions(
        cls, pred: TokenSeq, main_tk_lines: dict[Token, TokenSeq]
    ) -> str:
        id_map = {k: i for i, k in enumerate(main_tk_lines)}
        segs = output_ids_as_seqs(pred)
        lines = []
        for k, seg in segs.items():
            if not seg:
                continue  # skip empty lines
            if seg[-1] == Del_id:
                # show the deleted line
                section_lines = tk_splitlines(main_tk_lines.get(k, TokenSeq()))
                if section_lines:
                    origin_line = section_lines[0]
                else:
                    origin_line = cls.BAD_DELETE
 <del>                 origin_line.append(Newline_id)
 <del>                 seg = seg + origin_line
 <add>                 seg = seg + origin_line + [Newline_id]
            label = cls.show_label(id_map.get(k, -1))
            lines.append(f"{label}:{indent(decode_tokens(seg), ' ' * 4)

In [14]:
# Missing line of every fill-in-the-middle problem, each preceded by a separator.
for p in fim_problems:
    print(SEP, p.middle, sep="\n")

print("=" * 100)

# Span delta of every converted completion problem, in the same layout.
for p in comp_probs:
    print(SEP, p.span.delta, sep="\n")

--------------------------------------------------------------------------------
                seg = seg + origin_line + [Newline_id]
--------------------------------------------------------------------------------
                all_refs.append((f"unchanged ref {i}", chunk))
--------------------------------------------------------------------------------
            save_steps=max(500, min(10000, epoch_steps // 5)),
--------------------------------------------------------------------------------
    eval_tkn.max_query_tks = 1024
--------------------------------------------------------------------------------
                hidden_state_mask=tks_mask,
--------------------------------------------------------------------------------
TkDelta(
  15: ('<add>                 seg = seg + origin_line + [Newline_id]',)
)
--------------------------------------------------------------------------------
TkDelta(
  107: ('<add>                 all_refs.append((f"unchanged ref {i}", chunk))',)
)