<a href="https://colab.research.google.com/github/ShantKhatri/aqa-triage-data/blob/create_finetuning_data/CommitHunter/create_finetuning_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Clone the OpenJ9 repository and switch into it; the git commands issued by
# the functions in the next cell run in the current working directory.
!git clone https://github.com/eclipse-openj9/openj9.git
%cd openj9

Cloning into 'openj9'...
remote: Enumerating objects: 293519, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 293519 (delta 116), reused 78 (delta 78), pack-reused 293348 (from 3)[K
Receiving objects: 100% (293519/293519), 193.10 MiB | 14.12 MiB/s, done.
Resolving deltas: 100% (222549/222549), done.
Updating files: 100% (10320/10320), done.
/content/openj9


In [None]:
import json
import subprocess
import os
import textwrap

def get_commit_message(sha):
    """Return the subject line of the commit identified by ``sha``.

    Runs ``git show -s --format=%s <sha>`` in the current working directory.

    Args:
        sha: Commit hash (or any revision git can resolve).

    Returns:
        The stripped subject line, or the placeholder
        ``"[Could not fetch commit message]"`` when git exits with an error
        or cannot be started at all.
    """
    try:
        raw_subject = subprocess.check_output(
            ["git", "show", "-s", "--format=%s", sha]
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a git executable that is missing or cannot be
        # launched; treat it like any other failed lookup instead of crashing.
        return "[Could not fetch commit message]"
    # Commit subjects are not guaranteed to be valid UTF-8; replace bad bytes
    # rather than raising UnicodeDecodeError.
    return raw_subject.decode("utf-8", errors="replace").strip()

def _run_git(*args):
    """Run ``git`` with ``args`` in the current directory and return stdout as text."""
    # Diffs and commit metadata may contain bytes that are not valid UTF-8
    # (e.g. changed files in a legacy encoding); replace them instead of
    # letting a UnicodeDecodeError abort the whole data-generation run.
    return subprocess.check_output(["git", *args]).decode("utf-8", errors="replace")


def generate_analysis_prompt(good_sha, bad_sha, failure_logs, max_diff_chars=4000):
    """Build the analysis prompt for the commits in ``good_sha..bad_sha``.

    For every non-merge commit in the range (oldest first) the prompt lists
    the commit hash, subject, author name, changed files and the beginning of
    its diff, followed by the failure logs and the fixed task instructions.

    Args:
        good_sha: Revision of the last known good build (excluded from the range).
        bad_sha: Revision of the first known bad build (included in the range).
        failure_logs: Text of the failing test output; it is dedented before
            being embedded.
        max_diff_chars: Maximum number of diff characters kept per commit.

    Returns:
        The prompt as a single string, or ``None`` when the commit range
        cannot be listed (a warning is printed in that case).

    Raises:
        subprocess.CalledProcessError: If git fails while reading the details
            of an individual commit that was listed in the range.
    """
    try:
        commit_list = _run_git(
            "rev-list", "--reverse", "--no-merges", f"{good_sha}..{bad_sha}"
        ).strip().splitlines()
    except subprocess.CalledProcessError:
        print(f"Warning: Could not find commits between {good_sha} and {bad_sha}. Skipping.")
        return None
    except OSError as exc:
        # git itself could not be started (e.g. it is not installed).
        print(f"Warning: Could not run git ({exc}). Skipping.")
        return None

    prompt_lines = [
        "==== INPUT FOR COMMIT HUNTER (SHORT VERSION) ====\n",
        f"Good Build SHA: {good_sha}",
        f"Bad Build SHA:  {bad_sha}\n",
        f"Analysis of Commits Between {good_sha} and {bad_sha}",
        "=" * 60 + "\n"
    ]

    # The pretty-format is the same for every commit, so build it once.
    commit_details_format = "commit:%H%nCommit message: %s%nAuthor Name: %an"

    for sha in commit_list:
        commit_details = _run_git(
            "show", "-s", f"--format={commit_details_format}", sha
        ).strip()
        prompt_lines.append(commit_details)

        files_changed_raw = _run_git(
            "diff-tree", "--no-commit-id", "--name-only", "-r", sha
        ).strip()
        prompt_lines.append("Files Changed:")
        prompt_lines.extend(
            f"  - {file_line}" for file_line in files_changed_raw.splitlines()
        )

        diff_content = _run_git("show", sha, "--unified=5")

        # `git show` prints the commit header first; keep only the patch part,
        # which starts at the first "diff --git" marker.
        diff_start_index = diff_content.find("diff --git")
        if diff_start_index != -1:
            clean_diff = diff_content[diff_start_index:]
            prompt_lines.append("Content changed(diff):")
            # Cap the diff so a single large commit cannot dominate the prompt.
            prompt_lines.append(clean_diff[:max_diff_chars])

        prompt_lines.append("\n" + "=" * 60 + "\n")

    # NOTE: the instruction text below is emitted verbatim into the training
    # data, so its wording is intentionally left untouched.
    prompt_lines.extend([
        "=== Failed Test Context ===\n",
        textwrap.dedent(failure_logs),
        "\n",
        "=== Prompt ===\n",
        textwrap.dedent("""
            Think step-by-step and explain your reasoning before giving the final answer.
            Based on the commits, changed files, content changed, test failure and stack trace,
            which commit is most likely responsible for the regression?
            Do major focus on the changed files and the contentchanged with respect to the failure trace we have, as commit message may some times
            not related as it totally dependson the developer, also we have some merge message so, better to focus on the content and files changed.
            TASK: Analyze the commits above and identify which specific commit is most likely responsible for causing a build failure in OpenJ9.

            RESPONSE FORMAT:
            1. Most Suspicious Commit: [commit hash and message(Important)]
            2. Risk Level: [High/Medium/Low]
            3. Reasoning: [detailed explanation]
            4. Recommendation: [what to investigate next]

            List about 2-3 suspected commits.
        """)
    ])

    return "\n".join(prompt_lines)

def generate_ideal_completion(culprit_sha, failure_logs, reasoning):
    """Build the ideal 'assistant' answer for a known culprit commit.

    Args:
        culprit_sha: Hash of the ground-truth culprit commit.
        failure_logs: Unused; kept so existing callers keep working.
        reasoning: Optional free-text explanation. Each line is stripped of
            surrounding whitespace. When it is ``None`` or blank, a generic
            default explanation is used instead.

    Returns:
        A three-line (plus any extra reasoning lines) completion string:
        suspicious commit, risk level and reasoning, with no leading
        indentation on any of the numbered lines.
    """
    culprit_message = get_commit_message(culprit_sha)

    # Use the reasoning from the JSON if available, otherwise fall back to a default.
    if reasoning and reasoning.strip():
        cleaned_reasoning = "\n".join(
            line.strip() for line in reasoning.strip().split("\n")
        )
    else:
        cleaned_reasoning = "A detailed analysis of the culprit commit is required to determine the root cause. The failure logs should be cross-referenced with the code changes in the commit."

    # Assemble the lines directly instead of dedenting an indented f-string:
    # multi-line reasoning starts at column 0, which made textwrap.dedent find
    # no common margin and left items 2 and 3 indented by eight spaces.
    return "\n".join([
        f'1. Most Suspicious Commit: {culprit_sha} ("{culprit_message}")',
        "2. Risk Level: High",
        f"3. Reasoning: {cleaned_reasoning}",
    ])

def create_finetuning_data(input_filename="processed_prs.json",
                           output_filename="finetuning_data.jsonl"):
    """Generate the fine-tuning JSONL file from recorded failure data.

    The input JSON must be a list of objects with the keys ``pr``,
    ``goodSHA``, ``badSHA``, ``failure_logs`` and ``culprit_SHA``; the key
    ``reasoning`` is optional. One ``{"prompt": ..., "completion": ...}``
    object is written per line for every record whose prompt could be built;
    records for which ``generate_analysis_prompt`` returns ``None`` are
    skipped.

    Args:
        input_filename: Path of the JSON file holding the failure records.
        output_filename: Path of the JSONL file to (over)write.
    """
    print(f"Loading failure data from '{input_filename}'...")
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(input_filename, "r", encoding="utf-8") as f_in:
        failure_data_list = json.load(f_in)

    total_records = len(failure_data_list)
    print(f"Starting fine-tuning data generation with {total_records} records...")

    with open(output_filename, "w", encoding="utf-8") as f_out:
        for record_number, data in enumerate(failure_data_list, start=1):
            print(f"Processing record {record_number}/{total_records} for PR {data['pr']}...")

            prompt_content = generate_analysis_prompt(
                data['goodSHA'], data['badSHA'], data['failure_logs']
            )
            if prompt_content is None:
                # The commit range could not be resolved; skip this record.
                continue

            completion_content = generate_ideal_completion(
                data['culprit_SHA'], data['failure_logs'], data.get('reasoning')
            )

            fine_tuning_record = {
                "prompt": prompt_content,
                "completion": completion_content
            }

            f_out.write(json.dumps(fine_tuning_record) + "\n")

    # "\F" was an invalid escape sequence that printed a literal backslash;
    # a blank line before the final message is what was intended.
    print(f"\nFine-tuning data successfully written to '{output_filename}'")


# Run the generation: reads processed_prs.json and writes finetuning_data.jsonl.
create_finetuning_data()

🚀 Starting fine-tuning data generation...
Processing record 1/1 for PR 22185...

✅ Fine-tuning data successfully written to 'finetuning_data.jsonl'


# The script below finds candidate good and bad SHAs for a given culprit commit.
## It prints the last commit of the previous day and the last commit of the same day as the culprit commit.

In [None]:
import os
import subprocess

# Commit whose date is used to search for the surrounding day boundaries.
target_sha = "7ee010c7af9d7a9415a3aace2ec2bcae0af808c9"

repo_url = "https://github.com/eclipse-openj9/openj9.git"
# Relative path: it is resolved against the current working directory.
repo_dir = "openj9"

# Clone only when no checkout is present at repo_dir; otherwise reuse it.
if not os.path.exists(repo_dir):
  print(f"Cloning repository: {repo_url}...")
  !git clone --quiet {repo_url}
else:
  print(f"Repository '{repo_dir}' already exists.")

def run_command(command, working_dir, timezone="America/New_York"):
  """Run a command and return ``(stdout, error)``.

  Args:
    command: Command to run. A string is executed through the shell; a
      list/tuple of arguments is executed directly without a shell.
    working_dir: Directory in which the command is run.
    timezone: Value exported to the child process as the ``TZ`` environment
      variable.

  Returns:
    ``(stdout, None)`` with stripped standard output when the command exits
    with status 0, otherwise ``(None, error)`` where ``error`` is always a
    non-empty string: the stripped standard error, or a message naming the
    exit status when the command wrote nothing to standard error.
  """
  env = os.environ.copy()
  env['TZ'] = timezone

  result = subprocess.run(
      command,
      cwd=working_dir,
      capture_output=True,
      text=True,
      # Output (e.g. commit messages) is not guaranteed to be decodable;
      # replace undecodable bytes instead of raising.
      errors="replace",
      # Only plain strings need shell parsing; argument lists run directly.
      shell=isinstance(command, str),
      env=env,
  )

  if result.returncode != 0:
    print(f"Error running command: {command}")
    # Callers test the error with `if error:`, so never hand back an empty
    # string for a failed command.
    error = result.stderr.strip() or f"Command exited with status {result.returncode}"
    return None, error

  return result.stdout.strip(), None

# Step 1: ask git for the short committer date (%cs) of the target commit.
# NOTE(review): the message below labels this date as New York time; confirm
# that %cs actually follows the TZ variable set by run_command.
print(f"\nGetting date for target SHA: {target_sha[:10]}...")
date_command = f"git show -s --format=%cs {target_sha}"
commit_date, error = run_command(date_command, repo_dir)

if not error:
  print(f"Target commit date (NY Time): {commit_date}")

  # Step 2: newest commit on any ref strictly before midnight of that date.
  # --pretty=fuller makes git print both AuthorDate and CommitDate.
  print("\nSearching for the last commit of the PREVIOUS day...")
  prev_day_cmd = (
      "git log --all --pretty=fuller "
      f"--before='{commit_date} 00:00:00' -n 1"
  )
  prev_day_info, prev_error = run_command(prev_day_cmd, repo_dir)

  if prev_error:
    print(f"Error: {prev_error}")
  elif not prev_day_info:
    print("No commits found for the previous day.")
  else:
    print(
        "\n--- ✅ Last Commit of PREVIOUS Day ---",
        prev_day_info,
        "---------------------------------------",
        sep="\n",
    )

  # Step 3: newest commit on any ref within the target date itself.
  print("\nSearching for the last commit of the SAME day...")
  same_day_cmd = (
      "git log --all --pretty=fuller "
      f"--since='{commit_date} 00:00:00' "
      f"--until='{commit_date} 23:59:59' -n 1"
  )
  same_day_info, same_error = run_command(same_day_cmd, repo_dir)

  if same_error:
    print(f"Error: {same_error}")
  elif not same_day_info:
    print("No commits found for the same day.")
  else:
    print(
        "\n--- ✅ Last Commit of SAME Day ---",
        same_day_info,
        "-----------------------------------",
        sep="\n",
    )
else:
  # The date lookup failed, so there is nothing to search around.
  print(f"Error: {error}")

Cloning repository: https://github.com/eclipse-openj9/openj9.git...

Getting date for target SHA: 7ee010c7af...
Target commit date (NY Time): 2025-02-12

Searching for the last commit of the PREVIOUS day...

--- ✅ Last Commit of PREVIOUS Day ---
commit 07b21da3c485c5c433a8122e94873b17e5240c20
Author:     Daryl Maier <maier@ca.ibm.com>
AuthorDate: Tue Feb 11 22:23:31 2025 -0500
Commit:     Daryl Maier <maier@ca.ibm.com>
CommitDate: Tue Feb 11 22:23:31 2025 -0500

    Remove unnecessary includes of CS2 headers
    
    Signed-off-by: Daryl Maier <maier@ca.ibm.com>
---------------------------------------

Searching for the last commit of the SAME day...

--- ✅ Last Commit of SAME Day ---
commit 8c3feb95d0d7b2fc2cfcd2e9d87bdbbcdb18f358
Merge: 5d76f28aae 3d1cf31be7
Author:     Keith W. Campbell <keithc@ca.ibm.com>
AuthorDate: Wed Feb 12 17:54:38 2025 -0500
Commit:     GitHub <noreply@github.com>
CommitDate: Wed Feb 12 17:54:38 2025 -0500

    Merge pull request #21093 from theresa-m/fix_210