In [1]:
%load_ext autoreload
%autoreload 2

# NOTE(review): the star import is presumably what brings `proj_root`,
# `print_sections` and `join_list` (used in later cells) into scope, since no
# other import in the notebook provides them — confirm before narrowing it.
from coeditor.common import *
import os

# Switch the working directory to whatever `proj_root()` returns, so that
# relative paths used by the rest of the notebook resolve from there.
os.chdir(proj_root())

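This notebook demonstrates how to extract code completion problems from real code changes. Let's start by picking 4 consecutive commits from the history of this project, starting from a fixed commit id so that the results are reproducible (only the first three are printed below).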

In [2]:
from coeditor.git import get_commit_history, CommitInfo

# Repository whose history is mined for examples.
repo_root = proj_root()

# Pin the starting commit and the number of commits requested, so that every
# run of the notebook looks at the same slice of history.
head_commit_id = "84cfd5206348ecc3f54d202b830f803d8a03f26f"
n_commits = 4

commits = get_commit_history(repo_root, n_commits, commit_id=head_commit_id)

# Only the first three entries are printed.
for c in commits[:3]:
    print(c)


CommitInfo(hash='84cfd5206348ecc3f54d202b830f803d8a03f26f', parents=('a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6',), msg='Add ablation: dense attention.')
CommitInfo(hash='a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6', parents=('ecdbdc3875e47887ff3c0320fd1367af28d0a491',), msg='Implement ablation: current_code_only.')
CommitInfo(hash='ecdbdc3875e47887ff3c0320fd1367af28d0a491', parents=('ad918b35e2b8314f30a7f8ffc1e957c9f49956df',), msg='Exclude builtins defs in ctx by default.')


We can then extract `FIMProblem` instances from these commits using the `edits_from_commit_history` function, specifying a `C3CompletionGenerator` as the `change_processor`.

In [3]:
from coeditor.scoped_changes import edits_from_commit_history
from coeditor.experiments.code_completion import FIMProblem, C3CompletionGenerator

# Change processor handed to `edits_from_commit_history` below.
generator = C3CompletionGenerator()

# Working directory for the extraction: a `temp-1` folder next to the project
# root. (Assumes `proj_root()` returns a pathlib path — TODO confirm.)
workdir = proj_root() / ".." / "temp-1"
fim_problems = edits_from_commit_history(
    repo_root,
    commits,
    workdir,
    change_processor=generator,
)
print(f"{len(fim_problems) = }")

building initial project: 100%|██████████| 34/34 [00:00<00:00, 45.07it/s]
processing commits: 100%|██████████| 3/3 [00:04<00:00,  1.41s/it]

len(fim_problems) = 9





We can now visualize an example problem by first converting it into the input-output format used by CodeT5. Each problem instance asks the model to predict a missing line that corresponds to the last added line from the actual changes made to a given code region. Note that in this format, any previous changes made by the user are directly applied to the code that surrounds the missing line.

Feel free to change `ex_id` to see other examples.

In [4]:
from coeditor.encoding import decode_tokens


# Index of the problem to display; later cells reuse `ex_id` as well.
ex_id = 3
example = fim_problems[ex_id]

# NOTE: `input` shadows the Python builtin of the same name from here on.
input, output = example.to_codet5_format()

# The two values above are token sequences; decode them into strings
# (output first, then input) before printing each under its own heading.
decoded_output = decode_tokens(output)
decoded_input = decode_tokens(input)
print_sections(("output", decoded_output), ("input", decoded_input))

--------------------------------------------------------------------------------
output:
<s><extra_id_0>                seg = seg + origin_line + [Newline_id]</s>
--------------------------------------------------------------------------------
input:
<s>        if add_bos and sec:
            sec[0] = BOS_id
    else:
        assert_eq(direction, TruncateAt.Right)
        if inplace:
            del sec[limit:]
        else:
            sec = sec[:limit]
        if add_bos and sec:
            sec[-1] = EOS_id
    return sec

def truncate_sections(
    total_limit: int,
    *sections: tuple[TokenSeq, TruncateAt.Value],
    add_bos: bool,
    inplace: bool = False,
) -> tuple[TokenSeq,...]:
    """Truncate a list of token sequences to fit within a total length limit.
    Earlier sections have priority over later sections.
    """

    # first, reserve equal space to each section
    section_lens = [total_limit // len(sections) for _ in sections]
    remaining = total_limit
    for i, (t

Now let's compare this with a different format that models this problem as a special case of code editing. To do that, we will run `edits_from_commit_history` again but with `C3ProblemGenerator` as the `change_processor`. This will give us `C3Problem` instances, which correspond to the general contextual code change prediction problem. We can then convert them into instances that are similar to the `FIMProblem` instances above using `C3ToCodeCompletion`.

In [5]:
from coeditor.c3problem import C3ProblemGenerator, C3ToCodeCompletion

# Same commits and working directory as before, but with a different
# change processor.
c3_problems = edits_from_commit_history(
    repo_root,
    commits,
    workdir,
    change_processor=C3ProblemGenerator(),
)

# Apply the transform to every problem, then merge the per-problem results
# with `join_list`.
transform = C3ToCodeCompletion()
per_problem_results = list(map(transform.transform, c3_problems))
comp_probs = join_list(per_problem_results)
print(f"{len(comp_probs) = }")

building initial project: 100%|██████████| 34/34 [00:00<00:00, 152.32it/s]
processing commits: 100%|██████████| 3/3 [00:09<00:00,  3.13s/it]

len(comp_probs) = 5





In [8]:
from coeditor.c3problem import C3ProblemTokenizer


# Tokenize the completion problem at the same index (`ex_id`) used for the
# FIM example, then print its rendered form.
tknizer = C3ProblemTokenizer(max_ref_tks_sum=2000)
selected_problem = comp_probs[ex_id]
tokenized_problem = tknizer.tokenize_problem(selected_problem)
print(tokenized_problem.show())

--------------------------------------------------------------------------------
path: scripts.train_model/train_model
n_references: 4
total_reference_tks: 1644
project: temp-1
commit: CommitInfo(hash='a1e2b73ab836924d0b1f9ed88e4fd90e7a6f61e6', parents=('ecdbdc3875e47887ff3c0320fd1367af28d0a491',), msg='Implement ablation: current_code_only.')
 <0>:<add>     eval_tkn.max_query_tks = 1024

      <s>train_model
      def train_model(
          model_name: str,
          dataset_name: str,
          description: str,
          encoder: C3CombinedEncoder = C3CombinedEncoder(),
          batch_args=BatchArgs.train_default(),
          eval_batch_args=BatchArgs.eval_default(),
          train_args=TrainingArgs(),
          recreate_data: bool = False,
          resumed_from: Path | None = None,
          eval_only: bool = False,
          quicktest: bool = False,
      ):
      <s>_from is None:
              model = RetrievalEditorModel.from_code_t5("base", reuse_embed=True)
          else: