In [1]:
import os

import pandas as pd
from huggingface_hub import list_repo_tree, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


# Hub dataset repository with the benchmark results, and the folder inside it
# that holds one sub-folder per model.
RESULTS_REPO = "JetBrains-Research/lca-results"
PREDICTIONS_DIR = "commit_message_generation/predictions"

# Every entry listed directly under PREDICTIONS_DIR is treated as a model name.
models = [
    os.path.basename(entry.path)
    for entry in list_repo_tree(RESULTS_REPO, repo_type="dataset", path_in_repo=PREDICTIONS_DIR)
]

# Maps model name -> DataFrame of its predictions, indexed by (repo, hash).
models_predictions = {}
for model in models:
    try:
        local_path = hf_hub_download(
            RESULTS_REPO,
            f"{PREDICTIONS_DIR}/{model}/predictions.jsonl",
            repo_type="dataset",
        )
    except EntryNotFoundError:
        # Only a missing `predictions.jsonl` triggers the fallback; any other
        # failure (network, authentication, ...) is allowed to propagate.
        # The fallback takes the first entry listed in the model's folder.
        prediction_file = [
            entry.path
            for entry in list_repo_tree(
                RESULTS_REPO, repo_type="dataset", path_in_repo=f"{PREDICTIONS_DIR}/{model}"
            )
        ][0]
        local_path = hf_hub_download(RESULTS_REPO, prediction_file, repo_type="dataset")

    predictions_df = pd.read_json(local_path, orient="records", lines=True)
    models_predictions[model] = predictions_df.set_index(["repo", "hash"])

# Download the benchmark's test split and index it by (repo, hash).
local_path = hf_hub_download(
    repo_id="JetBrains-Research/lca-commit-message-generation",
    filename="commitchronicle-py-long/test-00000-of-00001-c91b0bca349ab4a3.parquet",
    repo_type="dataset",
)
df = pd.read_parquet(local_path).set_index(["repo", "hash"])

In [2]:
# Show the first two model names together with the total number of models.
models[:2], len(models)

(['DeepSeek-R1', 'DeepSeek-V3'], 31)

In [3]:
# Preview the first rows of the predictions frame stored under "DeepSeek-R1".
models_predictions["DeepSeek-R1"].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reference,prediction,created,model,system_fingerprint
repo,hash,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mesonbuild/meson,28e3ce67ae49494d57372f27b6f91580656f77a7,Convert test protocol into an enum\n\nThis giv...,"<think>\nOkay, I need to write a commit messag...",1738258751,deepseek-ai/DeepSeek-R1,
mycroftai/mycroft-core,b7d709c3c86af4f58cc8a8f7bbd089f319a0718b,Add wait_for_message() method to messagebus cl...,"<think>\nOkay, let's tackle this commit messag...",1738258758,deepseek-ai/DeepSeek-R1,
teemtee/tmt,dbffb035f72de8d43e5f04fc1ea0ce1b0da21e7a,Support selecting objects under the current fo...,"<think>\nOkay, let's look at this diff. The ma...",1738258759,deepseek-ai/DeepSeek-R1,
appscale/gts,6c368f596a1654a4866ab75db76759cd1f0aeb00,Support the service_account_name parameter\n\n...,"<think>\nOkay, I need to write a commit messag...",1738258759,deepseek-ai/DeepSeek-R1,
miurahr/aqtinstall,ed6cf7696871e01685c1c96d6d13deb6439ae490,Add CLI option to install desktop qt automatic...,"<think>\nOkay, let me figure out what the comm...",1738258760,deepseek-ai/DeepSeek-R1,


In [4]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,license,message,mods
repo,hash,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cupy/cupy,c27d31c06520c3df4c820ea10d5d16316f4d88cb,19.07.2017 16:24:41,MIT License,"Support CUDA stream on memory pool\n\nNow, mem...","[{'change_type': 'MODIFY', 'diff': '@@ -1,4 +1..."
cupy/cupy,6683a9aa7bae67e855cd9d1f17fdc49eb3f6dea0,17.06.2020 22:41:09,MIT License,Complete overhaul of filter testing.\n\nThese ...,"[{'change_type': 'MODIFY', 'diff': '@@ -11,359..."
cupy/cupy,dad51485282b6e05c4993b0733bd54aa3c0bacef,12.01.2021 16:21:46,MIT License,"Use ""import numpy as np"" in the array_api subm...","[{'change_type': 'MODIFY', 'diff': '@@ -1,76 +..."
cupy/cupy,76eb888612183768d9e1b0c818fcf5416c5f28c7,20.01.2021 18:25:20,MIT License,Use _implementation on all functions that have...,"[{'change_type': 'MODIFY', 'diff': '@@ -35,7 +..."
cupy/cupy,994ce07595026d5de54f52ef5748b578f9fae1bc,09.07.2021 13:57:44,MIT License,Use better type signatures in the array API mo...,"[{'change_type': 'MODIFY', 'diff': '@@ -396,7 ..."


In [5]:
import random

# Fixed seed so the same (model pair, commit) draws are produced on every run.
random.seed(42)

# Number of draws attempted (see the note on duplicates below).
NUM_SAMPLES = 500

# The candidate commits do not change while sampling, so build the list once
# instead of on every iteration.
commit_keys = df.index.tolist()

sampled_data = []
already_sampled = set()
for _ in range(NUM_SAMPLES):
    # Draw two distinct models, then one (repo, hash) commit key.
    sample_pair = random.sample(models, 2)
    sample_commit = random.sample(commit_keys, 1)[0]
    model_A, model_B = sample_pair
    repo, commit_hash = sample_commit

    # NOTE: a repeated (model_A, model_B, repo, hash) draw is skipped rather
    # than redrawn, so the result may contain fewer than NUM_SAMPLES rows.
    # The key is order-sensitive: (A, B) and (B, A) count as different pairs.
    sample_key = (model_A, model_B, repo, commit_hash)
    if sample_key in already_sampled:
        continue
    already_sampled.add(sample_key)

    # Look the predictions up only for draws that are actually kept.
    pred_A = models_predictions[model_A].loc[sample_commit, "prediction"]
    pred_B = models_predictions[model_B].loc[sample_commit, "prediction"]

    sampled_data.append({"model_A_pred": pred_A,
                         "model_B_pred": pred_B,
                         "repo": repo,
                         "hash": commit_hash,
                         "url": f"https://github.com/{repo}/commit/{commit_hash}",
                         "model_A": model_A,
                         "model_B": model_B})

sampled_df = pd.DataFrame(sampled_data)
# Expose the row position as an explicit `id` column.
sampled_df["id"] = sampled_df.index
sampled_df.head()

Unnamed: 0,model_A_pred,model_B_pred,repo,hash,url,model_A,model_B,id
0,Implement type promotion for array API\n\nThis...,Implement custom result type promotion.\n\nThe...,cupy/cupy,29535ad693507084ff3691fefd637a6b7292674f,https://github.com/cupy/cupy/commit/29535ad693...,gemini-pro-1.5,Llama-3.1-70B-Instruct-Turbo,0
1,Refactor audio and STT classes to use ABCMeta ...,Use `metaclass=ABCMeta` in class definitions\n...,mycroftai/mycroft-core,39c844a257ef934b244af28c882c1fe93f6fe61d,https://github.com/mycroftai/mycroft-core/comm...,gpt_3.5_turbo_16k_0613,Qwen2.5-Coder-32B-Instruct,1
2,Simplify system detection in environment initi...,**Simplify host system detection in Meson libr...,mesonbuild/meson,e0ed1ceae2e00d6c6efab39d4712d2522d89e929,https://github.com/mesonbuild/meson/commit/e0e...,QwQ-32B-Preview,Llama-3.1-8B-Instruct-Turbo,2
3,Add verbose mode and simplify JSON comparison ...,Update generate_combinations.py with improveme...,miurahr/aqtinstall,e3d28a803cdcd1f050dcbf8ea4ee4fa21954caba,https://github.com/miurahr/aqtinstall/commit/e...,gpt-4o-2024-11-20,gpt_3.5_turbo_16k_0613,3
4,Improve image guessing and Fedora release hand...,Refactor: Simplify image URL determination log...,teemtee/tmt,e2acc7e1b8d14cc3c1f48d9a6609c99f2043c756,https://github.com/teemtee/tmt/commit/e2acc7e1...,Llama-3.1-405B-Instruct-Turbo,deepseek-coder-6.7b-instruct,4


In [6]:
from huggingface_hub import upload_file, create_repo
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Target dataset repository on the Hub and the name of the exported file
# (used both locally and inside the repository).
LABELS_REPO = "saridormi/lca-cmg-labels"
SAMPLED_DATA_FILE = "sampled_data.jsonl"

# Write the sampled pairs as JSON Lines, one record per line.
sampled_df.to_json(SAMPLED_DATA_FILE, orient="records", lines=True)

# Make sure the repository exists before uploading. `exist_ok=True` turns the
# call into a no-op when the repository is already there, so there is no need
# to attempt an upload first and treat every possible failure as "repo missing".
create_repo(repo_id=LABELS_REPO, repo_type="dataset", private=True, exist_ok=True)

# Assigned to a name so the cell produces no output, as before.
upload_info = upload_file(
    repo_id=LABELS_REPO,
    repo_type="dataset",
    path_or_fileobj=SAMPLED_DATA_FILE,
    path_in_repo=SAMPLED_DATA_FILE,
)