In [None]:
import pandas as pd
from openai import OpenAI
from datetime import datetime
import time
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics.pairwise import cosine_distances
import json
from dotenv import load_dotenv
import os
from tokens_utils import *

# Pull variables from a local .env file into the process environment so that
# the credentials read below never have to be hard-coded in the notebook.
load_dotenv()

# OpenAI client shared by classify_issue(); the key comes from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# GitHub GraphQL endpoint and credentials. None of the cells visible here use
# them directly -- presumably consumed by other cells/helpers; verify before removing.
token = os.getenv("GITHUB_TOKEN")
url = 'https://api.github.com/graphql'
# NOTE(review): os.getenv returns None when GITHUB_TOKEN is unset, which makes
# this header the literal string "Bearer None" instead of failing early.
headers = {"Authorization": f"Bearer {token}"}

In [2]:
def classify_issue(number, title, body, comments):
    """Ask the OpenAI chat model to categorize one closed GitHub issue.

    The title, body and comments are interpolated as-is into a fixed prompt
    that requests one of six categories together with a rationale, a "fixed"
    flag, a fix reference and a duplicate reference, all as one JSON object.
    The request goes through the module-level ``client``.

    Args:
        number: Issue identifier. It is not sent to the model; it is only
            handed back so the caller can pair the answer with its issue.
        title: Issue title, inserted into the prompt.
        body: Issue body text, inserted into the prompt.
        comments: Issue comments in whatever form the caller stores them,
            inserted into the prompt.

    Returns:
        tuple: ``(number, content)`` where ``content`` is the message content
        of the first choice in the completion. JSON mode is requested, so it
        is expected to be a JSON object string; it is neither parsed nor
        validated here.
    """
    user_prompt = f"""
As a highly skilled software engineer and open source repository maintainer, your task is to categorize a closed GitHub issue into one of the following categories: 'bug', 'feature', 'question', 'build issue', 'documentation', or 'other'. Make your determination based on the issue’s title, body, and any available comments.

Begin with a concise checklist (3-7 bullets) of what you will do; keep items conceptual, not implementation-level. After categorization, verify that your choice aligns with the schema, and if not, clarify any ambiguity before finalizing.

Checklist (conceptual steps):
- Review the title, body, and comments for evidence supporting any category.
- Match findings against the provided category definitions.
- Identify keywords or phrases in each section that support classification.
- Assess whether the issue was fixed (if applicable) and note any relevant commit or PR references.
- Check whether the issue is a duplicate and identify the original issue number if so.
- If comments are absent, rely solely on the title and body and document any resulting limitations.

## Issue Details
Title: {title}
Body: {body}
Comments: {comments}

## Category Definitions
- **bug:** Reports unexpected or faulty behavior requiring developer intervention. Examples: "unexpected error", "crash when...", "fails to...", "does not work as expected". Intent or expected result may be described. Maintainers may confirm, provide a fix, or reference a commit/PR. Bugs refer to runtime/functional issues and not build/compilation errors.
- **feature:** Requests improvements or new functionality not tied to defects. Indicates a desired enhancement or new outcome. Examples: "add support for...", "feature request", "implement...".
- **question:** Inquires about usage, behavior, or outcomes; resolves with clarification, often due to misconfiguration or misunderstanding. Examples: "how do I...", "usage question", "is it correct behavior?", "Am I missing something or is it a bug?".
- **build issue:** Involves problems building or compiling, often with configuration details. Solutions may require new configs or versions. If the resolution necessitated code changes (beyond build configs), categorize as 'bug'. Examples: "build fails with...", "compilation error", "cannot install". Build issues are specific to installation, compilation, or setup processes.
- **documentation:** Relates to outdated info, errors, typos, or inconsistencies between docs and code. May reference PRs, but not code-level defects. Examples: "documentation outdated", "clarify docs", "readme error".
- **other:** Use only if none of the above categories apply.

## Instructions
- Clearly explain your reasoning for the chosen category.
- Cite specific evidence from the issue’s title, body, and (if applicable) comments to support the classification.
- Indicate if the issue was fixed, and provide the relevant commit or PR reference, or an empty string if none exists.
- If the issue is a duplicate, specify the original issue number; otherwise, use null.
- If comments are missing, base your rationale on the available information and declare the limitation.

After categorization, validate that your classification, rationale, and evidence conform exactly to the required output schema. Revise or adjust before final response if validation fails.

## Output Format
Return a JSON object strictly following this schema and key order:

{{
    "classification": string,           // One of: 'bug', 'feature', 'question', 'build issue', 'documentation', 'other'
    "rationale": string,                // Concrete reason for your classification decision in one sentence
    "fixed": boolean,                   // True if the issue was fixed, false otherwise
    "fix_commit_or_pr": string,         // Link/reference to fix commit or PR, or empty string
    "duplicate_of": string|null         // Original issue number if duplicate, null otherwise
}}

If a field (title, body, or comments) is empty or missing, summarize with what's present and mention any limitations in your rationale or evidence summaries. Maintain the specified key and field order.

Example Response:
{{
    "classification": "bug",
    "rationale": "The issue describes a wrong call of power instead of square for structured arrays.",
    "fixed": true,
    "fix_commit_or_pr": "PR #29392",
    "duplicate_of": null
}}

Please ensure your final response is a valid JSON object that adheres strictly to the schema and key order provided above.
    """

    # One system message setting the persona, one user message with the task.
    chat_messages = [
        {"role": "system", "content": "You are an expert software developer"},
        {"role": "user", "content": user_prompt},
    ]

    completion = client.chat.completions.create(
        model="gpt-5-mini-2025-08-07",
        messages=chat_messages,
        # JSON mode: ask the API for a single JSON object as the reply.
        response_format={"type": "json_object"},
        temperature=1,
    )

    raw_answer = completion.choices[0].message.content
    return number, raw_answer

def execute_model_for_issues(df, repo, batch_size=100, max_workers=64):
    """Classify every issue in ``df`` concurrently and collect the parsed answers.

    One ``classify_issue`` call is submitted per row to a thread pool. As the
    calls finish, each answer is parsed and appended to the result list, and a
    cumulative checkpoint file is written every ``batch_size`` successes.

    An issue whose request raises, or whose answer is not valid JSON, is
    reported on stdout and skipped, so a single failure no longer discards the
    results already collected.

    Args:
        df: DataFrame with the columns ``number``, ``title``, ``bodyText`` and
            ``comments``; one row per issue.
        repo: Project name, used as the prefix of the checkpoint file names.
        batch_size: Write a checkpoint each time this many more issues have
            been classified. ``0`` or ``None`` disables checkpoints.
        max_workers: Number of concurrent worker threads.

    Returns:
        list[dict]: One ``{"number": int, "classification": <parsed answer>}``
        entry per successfully classified issue, in completion order (not in
        the row order of ``df``).
    """
    results = []
    failed_numbers = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep the issue number next to each future so that a failed call can
        # still be attributed to its issue.
        future_to_number = {
            executor.submit(
                classify_issue,
                row["number"],
                row["title"],
                row["bodyText"],
                row["comments"],
            ): row["number"]
            for _, row in df.iterrows()
        }

        # No sleep between results: every task is already queued at this
        # point, so pausing the collecting thread would not slow down the
        # requests -- it would only delay collection and checkpoints.
        for future in as_completed(future_to_number):
            issue_number = future_to_number[future]
            try:
                # result() re-raises whatever the worker raised.
                number, classification_result = future.result()
                # Build the record from parsed values instead of splicing the
                # raw answer into a JSON string. int() also turns numpy
                # integers into plain ints that json.dump can serialize.
                record = {
                    "number": int(number),
                    "classification": json.loads(classification_result),
                }
            except Exception as exc:
                failed_numbers.append(issue_number)
                print(f"Issue {issue_number} could not be classified: {exc!r}")
                continue

            results.append(record)
            if batch_size and len(results) % batch_size == 0:
                print(f"Processed {len(results)} issues.")
                export_classification_to_json(results, f"{repo}_{len(results)}")

    if failed_numbers:
        skipped = ", ".join(str(n) for n in failed_numbers)
        print(f"{len(failed_numbers)} issue(s) failed and were skipped: {skipped}")
    return results

def export_classification_to_json(results, repo):
    """Write ``results`` as indented JSON to ``<repo>_classification.json``.

    Args:
        results: JSON-serializable object to dump (here, the list of
            classification records).
        repo: Prefix of the output file name. Callers in this notebook pass
            ``"<project>_<count>"``, and the value may include a directory.

    Returns:
        None. The file is created or overwritten, and a confirmation line is
        printed to stdout.
    """
    destination = f"{repo}_classification.json"
    with open(destination, mode='w') as handle:
        json.dump(results, handle, indent=4)
    print(f"Classification results saved to {destination}")

In [None]:
# We recommend running this analysis one project at a time: set `repo` to the
# project name and re-run this cell.
repo = "pytorch"
# The pickle must hold a DataFrame with the columns `number`, `title`,
# `bodyText` and `comments`, which execute_model_for_issues reads.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project.
df_issues = pd.read_pickle(f"issues_per_project/{repo}.pkl")
results = execute_model_for_issues(df_issues, repo)
# Final export of the full result list; the file is named
# "<repo>_<number of results>_classification.json".
export_classification_to_json(results, f"{repo}_{len(results)}")

# The results of this categorization are stored in JSON files, one per project.
# See: RQ1/issues_classification_json/ or RQ1/issues_classification_csv/

Processed 100 issues.
Classification results saved to pytorch_100_classification.json
Processed 200 issues.
Classification results saved to pytorch_200_classification.json
Processed 300 issues.
Classification results saved to pytorch_300_classification.json
Processed 400 issues.
Classification results saved to pytorch_400_classification.json
Processed 500 issues.
Classification results saved to pytorch_500_classification.json
Processed 600 issues.
Classification results saved to pytorch_600_classification.json
Processed 700 issues.
Classification results saved to pytorch_700_classification.json
Processed 800 issues.
Classification results saved to pytorch_800_classification.json
Processed 900 issues.
Classification results saved to pytorch_900_classification.json
Processed 1000 issues.
Classification results saved to pytorch_1000_classification.json
Processed 1100 issues.
Classification results saved to pytorch_1100_classification.json
Processed 1200 issues.
Classification results save