In [3]:
import difflib
from fileinput import filename
from unittest import case
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydriller import Repository
import pandas as pd
import subprocess
import re
from tqdm import tqdm

In [4]:
urls = ["https://github.com/jax-ml/jax"]
max_limit = 500
bugs = ["bug", "fix", "issue", "error", "problem", "fail", "exception", "crash", "fault", "defect", "refactor", "resolved"]
repo_path = "../LAB4/repos/JAX"
# Compile regex pattern (word boundaries, case-insensitive)
bug_pattern = re.compile(r"\b(" + "|".join(bugs) + r")\b", re.IGNORECASE)

def is_bug_fix_commit(message: str) -> bool:
    """Return True if commit message relates to bug fixing."""
    return bool(bug_pattern.search(message))

Identify bug-fixing commits from the repository. The strategy to define the
notion of a bug as well as how to identify the corresponding commit, should
be defined by you.
-  For each bug-fixing commit, store the following information (in csv format).
    - Hash | Message | Hashes of parents | Is a merge commit? | List of modified files

In [37]:
COMMIT_HISTORY = "bug_fixing_commit.csv"
data = {
    'Hash': [],
    'Parent Hash': [],
    'Message': [],
    'Is a merge commit?': [],
    'Modified File': []
}

In [38]:
for i, commit in enumerate(Repository(repo_path).traverse_commits()):
    if is_bug_fix_commit(commit.msg):
        parent_hashes = commit.parents or ["<no-parent>"]
        modified_files = [f.filename for f in commit.modified_files] or ["<no-file>"]

        for p in parent_hashes:
            for mf in modified_files:
                data['Hash'].append(commit.hash)
                data['Parent Hash'].append(p)
                data['Message'].append(commit.author.name + ": " + commit.msg)
                data['Is a merge commit?'].append(commit.merge)
                data['Modified File'].append(mf)

# Create the DataFrame and save it to a CSV file
if data['Hash']:
    data_df = pd.DataFrame(data)
    data_df.to_csv(COMMIT_HISTORY, index=False)
    print(f"\nSuccessfully generated {COMMIT_HISTORY} with {len(data['Hash'])} bug fix commit entries.")
else:
    print("\nNo bug fix commits found within the specified limit.")


Successfully generated bug_fixing_commit.csv with 13108 bug fix commit entries.


Diff Extraction and Analyses:
- For each modified file (in the previous step), store the following (as csv).

Hash

Message 

Filename

Source
Code
(before)

Source
Code
(current)

Diff LLM Inference
(fix type)

Rectified
Message
... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...

In [30]:
DIFF_EXTRACTION = "COMMIT_DIFF_ANALYSIS.csv"
data = {
    "Hash": [],
    "Message": [],
    "Filename": [],
    "Source Code (before)": [],
    "Source Code (current)": [],
    "Diff": [],
    # "LLM_Inference": [],
    # "Rectifier": []
}

In [8]:
def get_file_content(commit_hash, filepath):
    """Get file content at a specific commit hash (text or binary)."""
    if not filepath:
        return ""
    try:
        raw = subprocess.check_output(
            ["git", "-C", repo_path, "show", f"{commit_hash}:{filepath}"],
            stderr=subprocess.DEVNULL
        )
        # Try UTF-8 first
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return "<binary file>"
    except subprocess.CalledProcessError:
        return ""  # File may not exist in that commit

def get_diff(parent_sha, commit_sha, old_path, new_path):
    """Get diff between parent and commit for the given file."""
    if not old_path and not new_path:
        return ""
    try:
        raw = subprocess.check_output(
            [
                "git", "-C", repo_path, "diff", "--diff-algorithm=histogram", "--full-index",
                f"{parent_sha}:{old_path or new_path}",
                f"{commit_sha}:{new_path or old_path}"
            ],
            stderr=subprocess.DEVNULL
        )
        return raw.decode("utf-8", errors="replace")  # avoid crash on weird encodings
    except subprocess.CalledProcessError:
        return ""  # Diff not available


In [31]:
commits = list(Repository(repo_path).traverse_commits())[:200]

for commit in tqdm(commits, desc="Analyzing commits", unit="commit"):
    if is_bug_fix_commit(commit.msg):
        parent_hashes = commit.parents or ["<no-parent>"]
        for mod in commit.modified_files:
            for parent_sha in parent_hashes:
                old_code = get_file_content(parent_sha, mod.old_path)
                new_code = get_file_content(commit.hash, mod.new_path)
                diff = get_diff(parent_sha, commit.hash, mod.old_path, mod.new_path)

                data["Hash"].append(commit.hash)
                data["Message"].append(commit.author.name + ": " + commit.msg)
                data["Filename"].append(mod.new_path or mod.old_path or "<unknown>")
                data["Source Code (before)"].append(old_code)
                data["Source Code (current)"].append(new_code)
                data["Diff"].append(diff)

# Save DataFrame to CSV
if data["Hash"]:
    df = pd.DataFrame(data)
    df.to_csv(DIFF_EXTRACTION, index=False)
    print(f"\n✅ Successfully generated {DIFF_EXTRACTION} with {len(data['Hash'])} file-level entries.")
else:
    print("\n No bug fix commit details found.")

Analyzing commits: 100%|██████████| 200/200 [00:01<00:00, 106.22commit/s]



✅ Successfully generated COMMIT_DIFF_ANALYSIS.csv with 46 file-level entries.


In [32]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("mamiksik/CommitPredictorT5")
model = AutoModelForSeq2SeqLM.from_pretrained("mamiksik/CommitPredictorT5")

# Batch inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_input_len = 256
max_output_len = 64

# Function for single inference
def run_inference(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_len).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_output_len)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Apply with tqdm
df["LLM Inference"] = [
    run_inference(diff) for diff in tqdm(df["Diff"].fillna(""), desc="Running inference", total=len(df))
]

# Save output
output_csv = "big_commits_with_inference.csv"
df.to_csv(output_csv, index=False)
print(f"✅ Done. Saved {output_csv}")


Running inference: 100%|██████████| 46/46 [01:26<00:00,  1.88s/it]

✅ Done. Saved big_commits_with_inference.csv





In [34]:
import groq

# Your Groq API key (move to env variable in real use for safety!)
GROQ_KEY = "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"

# Initialize Groq client once
groq_client = groq.Groq(api_key=GROQ_KEY)


In [35]:
def suggest_commit_msg(diff_text: str) -> str:
    """
    Use Groq API to propose a commit message from a given diff.
    If the API fails, fallback with a generic message.
    """
    if not isinstance(diff_text, str) or not diff_text.strip():
        return "Minor update"

    try:
        # Limit diff length to avoid long prompts
        snippet = diff_text[:3500]  # slightly smaller cutoff than roommate's code

        # Craft instruction
        user_prompt = (
            "Please summarize the following code modifications into a clear, "
            "concise commit message:\n\n"
            f"{snippet}\n\n"
            "Commit message:"
        )

        # Call Groq LLM
        resp = groq_client.chat.completions.create(
            model="llama3-8b-8192",   # different model (faster + cheaper)
            messages=[{"role": "user", "content": user_prompt}],
            max_tokens=25,           # allow longer summaries
            temperature=0.2,         # make more deterministic
            top_p=1.0
        )

        # Extract output safely
        msg = resp.choices[0].message.content.strip()
        return msg.strip('"') if msg else "Code changes applied"

    except Exception as err:
        print(f"[Groq Error] {err}")
        return "Code changes applied"


In [36]:
df = pd.read_csv("big_commits_with_inference.csv")

# Generate Rectifier column with tqdm progress
rectified_msgs = []
for diff_val in tqdm(df["Diff"].fillna(""), desc="Rectifying with Groq", total=len(df)):
    rectified_msgs.append(suggest_commit_msg(diff_val))

df["Rectifier"] = rectified_msgs

# Save the updated DataFrame
output_file = "big_commits_with_rectifier.csv"
df.to_csv(output_file, index=False)

print(f"✅ Finished. Saved {output_file}")

Rectifying with Groq:  11%|█         | 5/46 [00:00<00:03, 12.00it/s]

[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model 

Rectifying with Groq:  30%|███       | 14/46 [00:00<00:01, 24.84it/s]

[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model 

Rectifying with Groq:  50%|█████     | 23/46 [00:00<00:00, 31.97it/s]

[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model 

Rectifying with Groq:  74%|███████▍  | 34/46 [00:01<00:00, 41.01it/s]

[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model 

Rectifying with Groq: 100%|██████████| 46/46 [00:01<00:00, 31.85it/s]

[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
[Groq Error] Error code: 400 - {'error': {'message': 'The model 




In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Hash                   46 non-null     object
 1   Message                46 non-null     object
 2   Filename               46 non-null     object
 3   Source Code (before)   46 non-null     object
 4   Source Code (current)  45 non-null     object
 5   Diff                   45 non-null     object
 6   LLM Inference          45 non-null     object
 7   Rectifier              46 non-null     object
dtypes: object(8)
memory usage: 3.0+ KB
