In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from openai import OpenAI
import numpy as np
import time
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similaritysimilar_fixes
from dotenv import load_dotenv
import os
from tokens_utils import * 

Similarity Score Calculation for Issues

In [None]:
def embed_issue(text):
    response = client.embeddings.create(
                model="text-embedding-3-short",
                input=text
    )
    return response

def embed_issues(df):
    checkpoint_files = glob.glob("issues_embeddings_checkpoint_*.pkl")
    if checkpoint_files:
        latest_checkpoint = max(checkpoint_files, key=lambda x: int(x.split('_')[-1].split('.')[0]))
        print(f"Loading from checkpoint: {latest_checkpoint}")
        df = pd.read_pickle(latest_checkpoint)
        start_index = df[df["embedding"].isnull()].index.min()
    else:
        df["embedding"] = pd.Series([None]*len(df), dtype=object)
        start_index = 0

    for i in range(start_index, len(df)):
        text = df.at[i, "complete_text"]
        wait_time = 5
        try:
            response = embed_issue(text)
            df.at[i, "embedding"] = response.data[0].embedding
            if i % 100 == 0:
                df.to_pickle(f"issues_embeddings_checkpoint_{i}.pkl")
                print(f"Processed {i} issues, checkpoint saved.")
        except Exception as e:
            print(f"Error at index {i}: {e}. Retrying in {wait_time} seconds.")
            time.sleep(wait_time)
            wait_time = min(wait_time * 2, 60) 
    return df

def analyze_neighbors(sim_matrix, k, df, tfidf_matrix, embedding_matrix):
    neighbors = []
    for idx, row in df.iterrows():
        sim_scores = sim_matrix[idx]
        neighbor_indices = sim_scores.argsort()[-(k+1):-1][::-1]
        neighbor_info = [(df.at[n_idx, 'number'], sim_scores[n_idx].round(4), tfidf_matrix[idx][n_idx].round(4), embedding_matrix[idx][n_idx].round(4)) for n_idx in neighbor_indices]
        neighbors.append({
            'issue_number': row['number'],
            'neighbors': neighbor_info
        })
    return neighbors

def normalize_neighbors(neighbors_list, threshold=0.7):
    pairs = []
    for item in neighbors_list:
        issue = item['issue_number']
        for neighbor, score, tf_idf, embed in item['neighbors']:
            if score >= threshold:
                pairs.append((issue, neighbor.item(), score.item(), tf_idf.item(), embed.item()))
    return pairs


def erase_pair_duplicates(pairs):
    unique_pairs = set()
    for issue1, issue2, score, tfidf, embed in pairs:
        if issue1 < issue2:
            unique_pairs.add((issue1, issue2, score, tfidf, embed))
        else:
            unique_pairs.add((issue2, issue1, score, tfidf, embed))
    return list(unique_pairs)

In [None]:
# We recommend doing this analysis project by project.
repo = 'pytorch'

# We include a copy of the processed issues (text cleaned for tf-idf analysis)
# For reference on how we cleaned data, go to: text_cleaning.ipynb
df_processed = pd.read_pickle(f"processed_issues/{repo}_processed.pkl")
df_processed['description_tokens'] = df_processed['description_tokens'].apply(reduce_tokens)
for idx, row in df_processed.iterrows():
    text, comments = reduce_tokens_with_comments(row['description_tokens'], row['comments_tokens'])
    df_processed.at[idx, 'complete_text'] = text + " " + comments 


Reducing tokens from 8788 to 8190
Reducing tokens from 15446 to 8190
Reducing tokens from 16490 to 8190
Reducing tokens from 9172 to 8190
Reducing tokens from 15766 to 8190
Reducing tokens from 14999 to 8190
Reducing tokens from 12524 to 8190
Reducing tokens from 8521 to 8190
Reducing tokens from 8481 to 8190
Reducing tokens from 8218 to 8190
Reducing tokens from 11361 to 8190
Reducing tokens from 9732 to 8190
Reducing tokens from 12119 to 8190
Reducing tokens from 11304 to 8190
Reducing tokens from 9011 to 8190
Reducing tokens from 8882 to 8190
Reducing tokens from 10599 to 8190
Reducing tokens from 10723 to 8190
Reducing tokens from 19644 to 8190
Reducing tokens from 8912 to 8190
Reducing tokens from 8690 to 8190
Reducing tokens from 10832 to 8190
Reducing tokens from 8238 to 8190
Reducing tokens from 18864 to 8190
Reducing tokens from 9857 to 8190
Reducing tokens from 11522 to 8190
Reducing tokens from 15850 to 8190
Reducing tokens from 11997 to 8190
Reducing tokens from 12574 to 81

In [None]:
repo = 'ComfyUI'
df_processed = pd.read_pickle(f"processed_issues/{repo}_processed.pkl")

if os.path.exists(f"issues_embeddings/{repo}_embeddings.pkl"):
    df_with_embeddings = pd.read_pickle(f"issues_embeddings/{repo}_embeddings.pkl")
else:
    df_with_embeddings = embed_issues(df_processed)
    df_with_embeddings.to_pickle(f"issues_embeddings/{repo}_embeddings.pkl")

In [603]:
# First Similarity Score: TF-IDF (using cosine similarity)
issues = df_processed['complete_text'].tolist()
tfidf_vectorizer = TfidfVectorizer(max_features=10000, tokenizer=lambda x: x.split())
tfidf_matrix = tfidf_vectorizer.fit_transform(issues)
tfidf_sim_matrix = cosine_similarity(tfidf_matrix)
tfidf_sim_matrix.shape



(64, 64)

In [604]:
# Second Similarity Score: Embeddings (using cosine similarity)
embeddings = df_with_embeddings['embedding'].tolist()
embeddings_matrix = np.array(embeddings)
embeddings_sim_matrix = cosine_similarity(embeddings_matrix)
embeddings_sim_matrix.shape

(64, 64)

In [605]:
# Combined Similarity Score: S1 + S2 --> Take only the 3 nearest neighbors per issue
combined_sim_matrix = tfidf_sim_matrix + embeddings_sim_matrix

In [None]:
pairs_to_analyze = analyze_neighbors(combined_sim_matrix, 3, df_with_embeddings, tfidf_sim_matrix, embeddings_sim_matrix)
pairs = erase_pair_duplicates(normalize_neighbors(pairs_to_analyze))
df_pairs = pd.DataFrame(pairs, columns=['issue_1', 'issue_2', 'similarity_score', 'tf_idf_score', 'embedding_score'])
df_pairs = df_pairs.sort_values(by='similarity_score', ascending=False)
df_pairs.to_csv(f"similar_issues/{repo}.csv", index=False)

Similarity Scores for Issues can be found in the data downloaded folder: RQ1/similar_issues/

Similarity Score Calculation for Pull Requests

In [None]:
prs_pkl = glob.glob(os.path.join('./prs_per_project/', '*.pkl'))
prs_df = pd.concat([pd.read_pickle(file) for file in prs_pkl], ignore_index=True)
# We control the length of the diffs for the model selected. 
encoding = tiktoken.encoding_for_model("text-embedding-3-large")
prs_df['diff_len'] = prs_df['diff'].apply(lambda x: len(encoding.encode(x, disallowed_special=())))
# We discard longer diffs
df_with_embeddings = prs_df[prs_df['diff_len'] <= 8191]
# Extract embeddings, we recommend to extract by parts
for idx, row in df_ready.iterrows():
    df_with_embeddings.at[idx, 'diff_embedding'] = get_embedding(row['diff'])

# All prs with embeddings can be found in RQ2/embeddings_fixes/

# Calculate nearest neighbor

X = np.vstack(df_with_embeddings["diff_embedding"].values)
knn = NearestNeighbors(
    n_neighbors=2,  
    metric="cosine"
)

knn.fit(X)
distances, indices = knn.kneighbors(X)

rows = []

for i, (dists, idxs) in enumerate(zip(distances, indices)):
    dist = dists[1]
    j = idxs[1]
    rows.append({
        "source_url": df_with_embeddings.iloc[i]["url"],
        "nearest_url": df_with_embeddings.iloc[j]["url"],
        "cosine_similarity": 1 - dist
    })

nn_df = pd.DataFrame(rows)

nn_df[["source_url", "nearest_url"]] = nn_df.apply(
    lambda r: sorted([r["source_url"], r["nearest_url"]]),
    axis=1,
    result_type="expand"
)

nn_df = (
    nn_df
    .groupby(["source_url", "nearest_url"], as_index=False)
    ["cosine_similarity"]
    .max()
)

nn_df = nn_df.sort_values(
    by="cosine_similarity",
    ascending=False
)

repo = file.split('/')[2].split('_')[0]
nn_df.to_csv(f'./similar_fixes/{repo}.csv')
    

Unnamed: 0,number,title,url,bodyText,mergedAt,owner,repo,diff,oid,message,committedDate,diff_len,diff_embedding
0,6357.0,Document get_attr and get_model_object,https://github.com/comfyanonymous/ComfyUI/pull...,Resolves #6303\nThis PR adds documentation on ...,2025-01-07T01:12:22Z,comfyanonymous,ComfyUI,diff --git a/comfy/model_patcher.py b/comfy/mo...,,,,,
1,6335.0,Update web content to release v1.6.16,https://github.com/comfyanonymous/ComfyUI/pull...,Resolves #6333\nCherry-pick following PRs to 1...,2025-01-03T18:56:47Z,comfyanonymous,ComfyUI,diff --git a/web/assets/BaseViewTemplate-Bklhd...,,,,,
2,6300.0,Convert `latents_ubyte` to 8-bit unsigned int ...,https://github.com/comfyanonymous/ComfyUI/pull...,"Hello, not sure if this is the best solution, ...",2025-01-28T13:22:54Z,comfyanonymous,ComfyUI,diff --git a/latent_preview.py b/latent_previe...,,,,,
3,7244.0,Update frontend to 1.12.14,https://github.com/comfyanonymous/ComfyUI/pull...,Cherry pick Comfy-Org/ComfyUI_frontend#3065\nd...,2025-03-15T05:38:10Z,comfyanonymous,ComfyUI,diff --git a/requirements.txt b/requirements.t...,,,,,
4,7143.0,Fix LoadImageOutput node,https://github.com/comfyanonymous/ComfyUI/pull...,The frontend now annotates uploaded image file...,2025-03-11T08:30:25Z,comfyanonymous,ComfyUI,diff --git a/nodes.py b/nodes.py\nindex bbf499...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18231,48216.0,[v1.7.1] [complex] torch.sqrt: fix edge values...,https://github.com/pytorch/pytorch/pull/48216,Summary:\nFixes #47358\nReplace the optimized ...,2020-11-19T17:25:00Z,pytorch,pytorch,diff --git a/aten/src/ATen/cpu/vec256/vec256_c...,,,,,
18232,48215.0,[v1.7.1] Make sure valid ParameterList/Dict do...,https://github.com/pytorch/pytorch/pull/48215,Summary:\nFixes #46983\nPull Request resolved:...,2020-11-19T17:24:08Z,pytorch,pytorch,diff --git a/test/test_nn.py b/test/test_nn.py...,,,,,
18233,48936.0,Disable autocast cache for tensor views as fix...,https://github.com/pytorch/pytorch/pull/48936,Summary:\nFixes #48049\nRoot cause of the issu...,2020-12-07T19:28:38Z,pytorch,pytorch,diff --git a/aten/src/ATen/autocast_mode.cpp b...,,,,,
18234,48768.0,[1.7.1] torch: Stop using _nt_quote_args from ...,https://github.com/pytorch/pytorch/pull/48768,Summary:\nThey removed the specific function i...,2020-12-03T04:41:56Z,pytorch,pytorch,diff --git a/torch/utils/cpp_extension.py b/to...,,,,,


Similarity Scores for Pull Requests can be found in the data downloaded folder: RQ2/similar_fixes/