In [3]:
import difflib
from fileinput import filename
from unittest import case
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydriller import Repository
import pandas as pd
import subprocess
import re
from tqdm import tqdm

In [4]:
urls = ["https://github.com/jax-ml/jax"]
max_limit = 500
bugs = ["bug", "fix", "issue", "error", "problem", "fail", "exception", "crash", "fault", "defect", "refactor", "resolved"]
repo_path = "../LAB4/repos/JAX"
# Compile regex pattern (word boundaries, case-insensitive)
bug_pattern = re.compile(r"\b(" + "|".join(bugs) + r")\b", re.IGNORECASE)

def is_bug_fix_commit(message: str) -> bool:
    """Return True if commit message relates to bug fixing."""
    return bool(bug_pattern.search(message))

Identify bug-fixing commits from the repository. The strategy to define the
notion of a bug as well as how to identify the corresponding commit, should
be defined by you.
-  For each bug-fixing commit, store the following information (in csv format).
    - Hash | Message | Hashes of parents | Is a merge commit? | List of modified files

In [37]:
COMMIT_HISTORY = "bug_fixing_commit.csv"
data = {
    'Hash': [],
    'Parent Hash': [],
    'Message': [],
    'Is a merge commit?': [],
    'Modified File': []
}

In [38]:
for i, commit in enumerate(Repository(repo_path).traverse_commits()):
    if is_bug_fix_commit(commit.msg):
        parent_hashes = commit.parents or ["<no-parent>"]
        modified_files = [f.filename for f in commit.modified_files] or ["<no-file>"]

        for p in parent_hashes:
            for mf in modified_files:
                data['Hash'].append(commit.hash)
                data['Parent Hash'].append(p)
                data['Message'].append(commit.author.name + ": " + commit.msg)
                data['Is a merge commit?'].append(commit.merge)
                data['Modified File'].append(mf)

# Create the DataFrame and save it to a CSV file
if data['Hash']:
    data_df = pd.DataFrame(data)
    data_df.to_csv(COMMIT_HISTORY, index=False)
    print(f"\nSuccessfully generated {COMMIT_HISTORY} with {len(data['Hash'])} bug fix commit entries.")
else:
    print("\nNo bug fix commits found within the specified limit.")


Successfully generated bug_fixing_commit.csv with 13108 bug fix commit entries.


Diff Extraction and Analyses:
- For each modified file (in the previous step), store the following (as csv).

Hash

Message 

Filename

Source
Code
(before)

Source
Code
(current)

Diff LLM Inference
(fix type)

Rectified
Message
... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...

In [5]:
DIFF_EXTRACTION = "COMMIT_DIFF_ANALYSIS.csv"
data = {
    "Hash": [],
    "Message": [],
    "Filename": [],
    "Source Code (before)": [],
    "Source Code (current)": [],
    "Diff": [],
    # "LLM_Inference": [],
    # "Rectifier": []
}

In [8]:
def get_file_content(commit_hash, filepath):
    """Get file content at a specific commit hash (text or binary)."""
    if not filepath:
        return ""
    try:
        raw = subprocess.check_output(
            ["git", "-C", repo_path, "show", f"{commit_hash}:{filepath}"],
            stderr=subprocess.DEVNULL
        )
        # Try UTF-8 first
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return "<binary file>"
    except subprocess.CalledProcessError:
        return ""  # File may not exist in that commit

def get_diff(parent_sha, commit_sha, old_path, new_path):
    """Get diff between parent and commit for the given file."""
    if not old_path and not new_path:
        return ""
    try:
        raw = subprocess.check_output(
            [
                "git", "-C", repo_path, "diff", "--diff-algorithm=histogram", "--full-index",
                f"{parent_sha}:{old_path or new_path}",
                f"{commit_sha}:{new_path or old_path}"
            ],
            stderr=subprocess.DEVNULL
        )
        return raw.decode("utf-8", errors="replace")  # avoid crash on weird encodings
    except subprocess.CalledProcessError:
        return ""  # Diff not available


In [9]:
commits = list(Repository(repo_path).traverse_commits())[:1000]

for commit in tqdm(commits, desc="Analyzing commits", unit="commit"):
    if is_bug_fix_commit(commit.msg):
        parent_hashes = commit.parents or ["<no-parent>"]
        for mod in commit.modified_files:
            for parent_sha in parent_hashes:
                old_code = get_file_content(parent_sha, mod.old_path)
                new_code = get_file_content(commit.hash, mod.new_path)
                diff = get_diff(parent_sha, commit.hash, mod.old_path, mod.new_path)

                data["Hash"].append(commit.hash)
                data["Message"].append(commit.author.name + ": " + commit.msg)
                data["Filename"].append(mod.new_path or mod.old_path or "<unknown>")
                data["Source Code (before)"].append(old_code)
                data["Source Code (current)"].append(new_code)
                data["Diff"].append(diff)

# Save DataFrame to CSV
if data["Hash"]:
    df = pd.DataFrame(data)
    df.to_csv(DIFF_EXTRACTION, index=False)
    print(f"\n✅ Successfully generated {DIFF_EXTRACTION} with {len(data['Hash'])} file-level entries.")
else:
    print("\n No bug fix commit details found.")

Analyzing commits: 100%|██████████| 1000/1000 [00:03<00:00, 262.17commit/s]



✅ Successfully generated COMMIT_DIFF_ANALYSIS.csv with 262 file-level entries.


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mamiksik/CommitPredictorT5")
model = AutoModelForSeq2SeqLM.from_pretrained("mamiksik/CommitPredictorT5")

def get_inference(diff_text):
    if not isinstance(diff_text, str) or not diff_text.strip():
        return ""
    inputs = tokenizer(diff_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [3]:
input_csv = "COMMIT_DIFF_ANALYSIS.csv"
df = pd.read_csv(input_csv)

df["LLM Inference"] = df["Diff"].apply(get_inference)

# Save updated CSV
output_csv = "big_commits_with_inference.csv"
df.to_csv(output_csv, index=False)

print(f"✅ Added 'LLM Inference' column. Saved as {output_csv}")


: 