# Statistics for Commit Message Generation dataset from Long Code Arena

In [2]:
from datasets import load_dataset

df = load_dataset(
    "JetBrains-Research/lca-commit-message-generation", "commitchronicle-py-long", split="test"
).to_pandas()
df.head()

Unnamed: 0,hash,repo,date,license,message,mods
0,c27d31c06520c3df4c820ea10d5d16316f4d88cb,cupy/cupy,19.07.2017 16:24:41,MIT License,"Support CUDA stream on memory pool\n\nNow, mem...","[{'change_type': 'MODIFY', 'old_path': 'cupy/c..."
1,6683a9aa7bae67e855cd9d1f17fdc49eb3f6dea0,cupy/cupy,17.06.2020 22:41:09,MIT License,Complete overhaul of filter testing.\n\nThese ...,"[{'change_type': 'MODIFY', 'old_path': 'tests/..."
2,dad51485282b6e05c4993b0733bd54aa3c0bacef,cupy/cupy,12.01.2021 16:21:46,MIT License,"Use ""import numpy as np"" in the array_api subm...","[{'change_type': 'MODIFY', 'old_path': 'numpy/..."
3,76eb888612183768d9e1b0c818fcf5416c5f28c7,cupy/cupy,20.01.2021 18:25:20,MIT License,Use _implementation on all functions that have...,"[{'change_type': 'MODIFY', 'old_path': 'numpy/..."
4,994ce07595026d5de54f52ef5748b578f9fae1bc,cupy/cupy,09.07.2021 13:57:44,MIT License,Use better type signatures in the array API mo...,"[{'change_type': 'MODIFY', 'old_path': 'numpy/..."


In [3]:
len(df)

163

In [4]:
df.repo.nunique()

34

## Commit Messages

In [5]:
df["num_characters_msg"] = df.message.str.len()
df["num_words_msg"] = df.message.str.split(" ").str.len()
df["num_lines_msg"] = df.message.str.split("\n").str.len()

df[["num_characters_msg", "num_words_msg", "num_lines_msg"]].describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_characters_msg,163.0,199.957055,63.854614,86.0,96.1,114.0,123.2,148.0,188.0,242.0,290.0,315.9,355.8,367.0
num_words_msg,163.0,28.797546,9.930101,9.0,11.0,15.0,16.0,21.5,28.0,36.0,43.0,45.9,51.38,58.0
num_lines_msg,163.0,4.638037,1.400274,2.0,2.0,3.0,3.0,4.0,4.0,5.0,7.0,7.0,8.38,9.0


## Diffs

In [6]:
df["num_modified_files"] = df.mods.str.len()
df["num_characters_diff"] = [sum(len(mod["diff"]) for mod in mods) for mods in df.mods]
df["num_words_diff"] = [sum(len(mod["diff"].split(" ")) for mod in mods) for mods in df.mods]
df["num_lines_diff"] = [sum(len(mod["diff"].split("\n")) for mod in mods) for mods in df.mods]

df[["num_modified_files", "num_characters_diff", "num_words_diff", "num_lines_diff"]].describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_modified_files,163.0,3.417178,2.61717,1.0,1.0,1.0,1.0,2.0,3.0,4.0,7.0,8.9,12.14,15.0
num_characters_diff,163.0,8697.766871,6055.710918,3346.0,3363.2,3470.5,3825.2,4863.5,6639.0,10020.0,17010.4,21030.3,27513.18,41714.0
num_words_diff,163.0,2086.349693,1553.492646,388.0,608.46,720.7,823.6,1039.0,1608.0,2414.0,4077.6,4842.8,7756.82,9645.0
num_lines_diff,163.0,212.453988,146.325791,67.0,72.48,81.1,92.8,113.5,159.0,248.0,413.8,499.3,710.58,864.0


## Files

### Utils

#### Downloading repositories

In [15]:
import os

data_dir = "../data"

In [None]:
import tarfile

from huggingface_hub import hf_hub_download, list_repo_tree

for repo_file in list_repo_tree("JetBrains-Research/lca-commit-message-generation", "repos", repo_type="dataset"):
    file_path = hf_hub_download(
        repo_id="JetBrains-Research/lca-commit-message-generation",
        filename=repo_file.path,
        repo_type="dataset",
        local_dir=data_dir,
    )

    with tarfile.open(file_path, "r:gz") as tar:
        tar.extractall(path=os.path.join(data_dir, "extracted_repos"))

### Helper function

In [24]:
from collections import defaultdict
from typing import Dict

import git


def get_changed_files_before_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:
    repo = git.Repo(repo_path)
    repo.git.checkout("HEAD", ".")
    repo.git.clean("-fd")
    commit = repo.commit(commit_hash)

    if len(commit.parents) > 1:
        raise ValueError("More than one parent")

    changed_files = list(commit.stats.files.keys())
    try:
        repo.git.checkout(commit.parents[0].hexsha)
    except git.GitCommandError as e:
        print(os.path.basename(repo_path), commit_hash, e)
        if repo.is_dirty(untracked_files=True):
            repo.git.stash("save", "--include-untracked")
            repo.git.clean("-fd")
            repo.git.checkout(commit.parents[0].hexsha)

    stats = defaultdict(int)
    for file_path in changed_files:
        try:
            with open(os.path.join(repo_path, file_path), "r") as file:
                content = file.read()
                stats["num_chars"] += len(content)
                stats["num_words"] += len(content.split(" "))
                stats["num_lines"] += len(content.split("\n"))
        except FileNotFoundError:
            print(
                f"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}"
            )

    repo.git.checkout("HEAD", ".")
    return stats


def get_changed_files_after_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:
    repo = git.Repo(repo_path)
    repo.git.checkout("HEAD", ".")
    repo.git.clean("-fd")
    commit = repo.commit(commit_hash)
    changed_files = list(commit.stats.files.keys())
    try:
        repo.git.checkout(commit_hash)
    except git.GitCommandError as e:
        print(os.path.basename(repo_path), commit_hash, e)
        repo.git.stash("save", "--include-untracked")
        repo.git.clean("-fd")
        repo.git.checkout(commit_hash)

    stats = defaultdict(int)
    for file_path in changed_files:
        try:
            with open(os.path.join(repo_path, file_path), "r") as file:
                content = file.read()
                stats["num_chars"] += len(content)
                stats["num_words"] += len(content.split(" "))
                stats["num_lines"] += len(content.split("\n"))
        except FileNotFoundError:
            print(
                f"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}"
            )

    repo.git.checkout("HEAD", ".")
    return stats


def get_changed_files_full_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:
    stats_before = get_changed_files_before_commit_stats(repo_path, commit_hash)
    stats_after = get_changed_files_after_commit_stats(repo_path, commit_hash)
    for key in stats_before:
        stats_before[key] += stats_after[key]
    return stats_before


def get_all_files_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:
    repo = git.Repo(repo_path)
    repo.git.checkout("HEAD", ".")
    repo.git.clean("-fd")
    commit = repo.commit(commit_hash)
    try:
        repo.git.checkout(commit_hash)
    except git.GitCommandError as e:
        print(os.path.basename(repo_path), commit_hash, e)
        if repo.is_dirty(untracked_files=True):
            repo.git.stash("save", "--include-untracked")
            repo.git.clean("-fd")
            repo.git.checkout(commit_hash)

    stats = defaultdict(int)

    for blob in commit.tree.traverse():
        if blob.type == "blob":
            try:
                with open(os.path.join(repo_path, str(blob.path)), "r") as file:
                    content = file.read()
                    stats["num_chars"] += len(content)
                    stats["num_words"] += len(content.split(" "))
                    stats["num_lines"] += len(content.split("\n"))
            except Exception:
                continue

    repo.git.checkout("HEAD", ".")
    return stats

In [25]:
from tqdm import tqdm

all_file_stats = []

for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):
    all_file_stats.append(get_all_files_stats(os.path.join(data_dir, "extracted_repos", repo.replace("/", "__")), hash))

 21%|██▏       | 35/163 [00:11<00:56,  2.27it/s]

mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	test cases/fortran/2 modules/mymod.F90
Please move or remove them before you switch branches.
Aborting'


 79%|███████▊  | 128/163 [01:13<00:04,  7.74it/s]

altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	doc/user_guide/API.rst
Please move or remove them before you switch branches.
Aborting'


100%|██████████| 163/163 [01:30<00:00,  1.80it/s]


In [26]:
from tqdm import tqdm

changed_files_before_commit_stats = []

for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):
    changed_files_before_commit_stats.append(
        get_changed_files_before_commit_stats(os.path.join(data_dir, "extracted_repos", repo.replace("/", "__")), hash)
    )

 22%|██▏       | 36/163 [00:13<00:45,  2.80it/s]

mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout 6f3f43bb2d31797b0f3128e1664652571fe314e6
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	test cases/fortran/2 modules/mymod.F90
Please move or remove them before you switch branches.
Aborting'


 79%|███████▊  | 128/163 [00:46<00:05,  6.99it/s]

altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout ea9ae53a60a7fbb0516ea020c5c0846f479d2546
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	doc/user_guide/API.rst
Please move or remove them before you switch branches.
Aborting'
File altair/vegalite/v5/tests/test_api.py before commit not found for commit 846a842a6dbd6c7f989bff5232c697be94ffb7b1 in repo altair-viz__altair


100%|██████████| 163/163 [00:56<00:00,  2.88it/s]


In [29]:
from tqdm import tqdm

changed_files_after_commit_stats = []

for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):
    try:
        changed_files_after_commit_stats.append(
            get_changed_files_after_commit_stats(
                os.path.join(data_dir, "extracted_repos", repo.replace("/", "__")), hash
            )
        )
    except git.GitCommandError:  # TODO: idk what's happening here
        continue

 22%|██▏       | 36/163 [00:10<00:41,  3.04it/s]

mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	test cases/fortran/2 modules/mymod.F90
Please move or remove them before you switch branches.
Aborting'


 79%|███████▊  | 128/163 [00:39<00:04,  7.37it/s]

altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	doc/user_guide/API.rst
Please move or remove them before you switch branches.
Aborting'


100%|██████████| 163/163 [00:49<00:00,  3.30it/s]


In [30]:
from tqdm import tqdm

changed_files_full_stats = []

for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):
    try:
        changed_files_full_stats.append(
            get_changed_files_full_stats(os.path.join(data_dir, "extracted_repos", repo.replace("/", "__")), hash)
        )
    except git.GitCommandError:
        continue

 21%|██▏       | 35/163 [00:15<01:05,  1.94it/s]

mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout 6f3f43bb2d31797b0f3128e1664652571fe314e6
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	test cases/fortran/2 modules/mymod.F90
Please move or remove them before you switch branches.
Aborting'


 22%|██▏       | 36/163 [00:16<01:08,  1.86it/s]

mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	test cases/fortran/2 modules/mymod.F90
Please move or remove them before you switch branches.
Aborting'


 79%|███████▊  | 128/163 [01:04<00:08,  3.93it/s]

altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout ea9ae53a60a7fbb0516ea020c5c0846f479d2546
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	doc/user_guide/API.rst
Please move or remove them before you switch branches.
Aborting'
File altair/vegalite/v5/tests/test_api.py before commit not found for commit 846a842a6dbd6c7f989bff5232c697be94ffb7b1 in repo altair-viz__altair
altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)
  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1
  stderr: 'error: The following untracked working tree files would be overwritten by checkout:
	doc/user_guide/API.rst
Please move or remove them before you switch branches.
Aborting'


100%|██████████| 163/163 [01:20<00:00,  2.02it/s]


### Statistics

#### Full repositories

In [31]:
import pandas as pd

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.DataFrame(all_file_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_chars,163.0,27551314.64,45993503.29,143958.0,150366.42,433326.4,735432.2,1575603.5,3621059.0,16665826.0,123637940.4,125353336.0,139253525.6,156086294.0
num_words,163.0,5298042.83,8981031.25,30055.0,31357.42,101402.4,155376.2,380582.5,714079.0,4428093.5,19158339.4,22839020.9,35143839.36,35146112.0
num_lines,163.0,737517.1,1348961.81,4116.0,4347.1,13112.6,21951.8,45627.5,102059.0,339901.5,3947990.2,3965309.4,4011263.44,5036935.0


#### Changed files (before commit)

In [32]:
import pandas as pd

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.DataFrame(changed_files_before_commit_stats).describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_chars,163.0,110043.72,128096.72,3694.0,4877.34,8137.5,12402.4,26225.5,58855.0,150216.5,267828.2,410150.1,546357.64,690367.0
num_words,163.0,29335.04,38953.63,731.0,888.28,2031.2,3129.8,6275.5,13917.0,37355.5,73427.4,105687.1,173032.2,259090.0
num_lines,163.0,2754.34,2979.61,80.0,157.66,253.8,320.6,718.5,1495.0,3986.5,6895.8,9269.6,13248.38,15599.0


#### Changed files (after commit)

In [33]:
import pandas as pd

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.DataFrame(changed_files_after_commit_stats).describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_chars,161.0,109222.15,127191.31,4161.0,5434.0,9549.0,12661.0,26524.0,57502.0,146896.0,261596.0,411628.0,552013.6,691296.0
num_words,161.0,29055.65,38812.42,808.0,1023.8,1896.0,2957.0,6293.0,13875.0,36675.0,67751.0,109455.0,175884.8,259183.0
num_lines,161.0,2741.02,2964.8,95.0,179.6,270.0,349.0,738.0,1520.0,3960.0,6726.0,9287.0,13337.4,15620.0


#### Changed files (full)

In [34]:
import pandas as pd

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.DataFrame(changed_files_full_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_chars,161.0,217227.4,254058.25,8093.0,11028.8,19504.0,25150.0,50868.0,116308.0,293780.0,524357.0,821099.0,1102607.8,1381663.0
num_words,161.0,57859.11,77569.59,1718.0,2021.8,4220.0,5555.0,12572.0,27646.0,73037.0,135488.0,215250.0,351514.8,518273.0
num_lines,161.0,5450.88,5924.83,175.0,359.2,568.0,654.0,1449.0,2994.0,7918.0,13493.0,18568.0,26650.8,31219.0
