In [9]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

# --- Load the classified issues ---
file_path = 'erpnext_issues_classification.csv'  # Replace with the actual file path
data = pd.read_csv(file_path)

# --- Step 1: keep rows with category "ERP Workflow" and component "Accounting" ---
is_erp_workflow = data['category'] == 'ERP Workflow'
is_accounting = data['component'] == 'Accounting'
filtered_issues = data.loc[is_erp_workflow & is_accounting].copy()

# --- Step 2: build one text per issue from the four descriptive columns ---
# Missing values become empty strings; the parts are joined with single spaces.
text_columns = ['preconditions', 'steps_to_reproduce', 'expected_results', 'actual_results']
combined_text = filtered_issues[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + filtered_issues[column_name].fillna('')
filtered_issues['combined_text'] = combined_text

# The selection below takes 50 rows, so refuse to continue with fewer.
if not len(filtered_issues) >= 50:
    raise ValueError("There are fewer than 50 issues meeting the criteria.")

# --- Step 3: embed every combined text with the 'all-MiniLM-L6-v2' model ---
model = SentenceTransformer('all-MiniLM-L6-v2')
texts_to_embed = filtered_issues['combined_text'].tolist()
embeddings = model.encode(texts_to_embed)

# --- Step 4: pairwise cosine similarity between all embeddings ---
similarity_matrix = cosine_similarity(embeddings)

# --- Step 5: rank issues by their summed similarity to all filtered issues ---
# A low row sum means the issue is, overall, unlike the others; argsort is
# ascending, so the first 50 positions are the 50 lowest sums.
similarity_sums = similarity_matrix.sum(axis=1)
most_distinct_indices = similarity_sums.argsort()[:50]

# --- Step 6: pick those rows by position ---
distinct_issues = filtered_issues.iloc[most_distinct_indices]

# --- Step 7: write the selection to disk ---
distinct_issues.to_csv('top_50_distinct_issues_sorted.csv', index=False)


In [53]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Input files
file_path = 'erpnext_issues_classification.csv'  # Replace with the actual file path (not read in this cell)
invalid_issues_file = 'erpnext_issues_all.csv'  # File containing all issues with URLs

# Step 1: Load all issues and flag the ones whose URL is not under the
# frappe/erpnext repository (a missing URL counts as invalid via na=False).
all_issues_data = pd.read_csv(invalid_issues_file)
has_valid_url = all_issues_data['url'].str.startswith('https://github.com/frappe/erpnext/', na=False)
invalid_url_issues = all_issues_data[~has_valid_url]
invalid_issue_numbers = set(invalid_url_issues['number'])

# Step 2: Load the title data, keeping only rows with a valid URL.
# Previously the invalid rows were identified above but never excluded, so
# their titles still took part in the title grouping. Filtering by row (not by
# issue number) avoids dropping a valid issue that merely shares its number
# with an invalid row.
issue_titles = all_issues_data.loc[has_valid_url, ['number', 'title']].dropna().copy()

# Preprocess titles to remove "(backport #...)"
def preprocess_title(title):
    """Return ``title`` cut off at the first "(backport" marker.

    Everything from the first occurrence of the literal text "(backport"
    onward is dropped, and surrounding whitespace is stripped from what
    remains. Titles without the marker are only stripped.
    """
    leading_text, _marker, _remainder = title.partition("(backport")
    return leading_text.strip()

issue_titles['title'] = issue_titles['title'].apply(preprocess_title)

# Compute title similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
title_embeddings = model.encode(issue_titles['title'].tolist())
title_similarity_matrix = cosine_similarity(title_embeddings)

# Greedily group issues whose title similarity to a seed exceeds the threshold.
# Positions are visited in ascending order; each not-yet-grouped position
# becomes the seed of a new group and absorbs every still-ungrouped position
# whose similarity to the seed is above `title_threshold`. Groups hold row
# positions into `issue_titles` (for use with .iloc), seed first.
title_threshold = 0.97
title_groups = []
is_grouped = [False] * len(issue_titles)

for current in range(len(issue_titles)):
    if is_grouped[current]:
        continue
    is_grouped[current] = True
    group = [current]
    # One vectorized comparison per seed row instead of a Python-level test of
    # every remaining index; only the positions above the threshold are looped.
    similar_positions = (title_similarity_matrix[current] > title_threshold).nonzero()[0]
    for other in similar_positions.tolist():
        if not is_grouped[other]:
            is_grouped[other] = True
            group.append(other)
    title_groups.append(group)

# Select one issue from each title group: among the rows of `filtered_issues`
# whose 'number' belongs to the group, keep the one with the largest combined
# length of 'preconditions' and 'steps_to_reproduce'.
# NOTE: `filtered_issues` is not created in this cell; it must already exist in
# the kernel, and it is overwritten below with the de-duplicated selection.
title_numbers = issue_titles['number'].to_numpy()  # positional lookup, built once
selected_titles = []
for group in title_groups:
    # Map the group's title rows to filtered_issues by 'number'
    group_issues_filtered = filtered_issues[filtered_issues['number'].isin(title_numbers[group])].copy()
    if group_issues_filtered.empty:
        continue
    group_issues_filtered.loc[:, 'combined_length'] = (
        group_issues_filtered['preconditions'].fillna('').apply(len) +
        group_issues_filtered['steps_to_reproduce'].fillna('').apply(len)
    )
    selected_issue = group_issues_filtered.sort_values(by='combined_length', ascending=False).iloc[0]
    selected_titles.append(selected_issue)

# Fail with a clear message instead of building a column-less DataFrame that
# would only surface later as a KeyError on 'preconditions'.
if not selected_titles:
    raise ValueError("No filtered issue matched any title group; nothing to select.")

# Update filtered_issues with the selected issues (one per title group)
filtered_issues = pd.DataFrame(selected_titles)

# Step 3: Build the text to embed from 'preconditions', 'steps_to_reproduce',
# 'expected_results' and 'actual_results' (missing values become empty
# strings, parts are separated by single spaces)
text_columns = ['preconditions', 'steps_to_reproduce', 'expected_results', 'actual_results']
combined_text = filtered_issues[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + filtered_issues[column_name].fillna('')
filtered_issues['combined_text'] = combined_text

# Step 4: Embed every combined text with the already-loaded model
texts_to_embed = filtered_issues['combined_text'].tolist()
embeddings = model.encode(texts_to_embed)

# Step 5: Pairwise cosine similarity between all embeddings
similarity_matrix = cosine_similarity(embeddings)

# Step 6: Greedy grouping by similarity threshold.
# Take any remaining position as a seed, collect every other remaining
# position whose similarity to the seed is above the threshold, and retire
# them all together. Groups hold row positions into `filtered_issues`.
threshold = 0.90  # Define a similarity threshold
groups = []
unused_indices = set(range(len(filtered_issues)))

while unused_indices:
    current = unused_indices.pop()
    neighbours = [
        candidate
        for candidate in list(unused_indices)
        if similarity_matrix[current, candidate] > threshold
    ]
    unused_indices.difference_update(neighbours)
    groups.append([current] + neighbours)

# Step 7: From every similarity group keep the single issue whose
# 'preconditions' and 'steps_to_reproduce' texts are longest in total
selected_issues = []
for member_positions in groups:
    candidates = filtered_issues.iloc[member_positions].copy()
    precondition_lengths = candidates['preconditions'].fillna('').apply(len)
    step_lengths = candidates['steps_to_reproduce'].fillna('').apply(len)
    candidates.loc[:, 'combined_length'] = precondition_lengths + step_lengths
    ranked = candidates.sort_values(by='combined_length', ascending=False)
    selected_issues.append(ranked.iloc[0])

# Assemble the chosen rows into one DataFrame
selected_issues_df = pd.DataFrame(selected_issues)

# Step 8: Write the selection to disk
selected_issues_df.to_csv('top_50_distinct_issues_sorted.csv', index=False)

KeyboardInterrupt: 

In [51]:
# Collect up to 10 example pairs of issues whose similarity lies in
# [0.85, 0.9). Every issue number appears in at most one pair.
# NOTE: relies on `filtered_issues` and `similarity_matrix` already existing in
# the kernel; row/column k of the matrix must describe row k of the frame.
pairs = []
used_issues = set()
n = len(filtered_issues)
if similarity_matrix.shape != (n, n):
    raise ValueError(
        "similarity_matrix does not match filtered_issues; "
        "recompute it before sampling pairs."
    )
issue_numbers = filtered_issues['number'].tolist()  # one lookup instead of repeated .iloc access

for i in range(n):
    if len(pairs) >= 10:
        break
    if issue_numbers[i] in used_issues:
        continue
    # Positions right of the diagonal whose similarity falls inside the band.
    row = similarity_matrix[i, i + 1:]
    in_band_offsets = ((row >= 0.85) & (row < 0.9)).nonzero()[0]
    for offset in in_band_offsets.tolist():
        j = i + 1 + offset
        if issue_numbers[j] in used_issues:
            continue
        first_issue = filtered_issues.iloc[i]
        second_issue = filtered_issues.iloc[j]
        pairs.append((
            issue_numbers[i],
            issue_numbers[j],
            first_issue['preconditions'],
            second_issue['preconditions'],
            first_issue['steps_to_reproduce'],
            second_issue['steps_to_reproduce'],
            first_issue['expected_results'],
            second_issue['expected_results'],
            first_issue['actual_results'],
            second_issue['actual_results'],
            similarity_matrix[i, j]
        ))
        used_issues.update((issue_numbers[i], issue_numbers[j]))
        # Issue i is now used: stop scanning its row. Without this, the same
        # issue i could be paired again with a later j.
        break

# Column layout of one pair tuple: both issue numbers, then each text field
# for issue 1 and issue 2 side by side, and finally the similarity score
pair_columns = [
    'Issue1_Number', 'Issue2_Number',
    'Issue1_Preconditions', 'Issue2_Preconditions',
    'Issue1_Steps', 'Issue2_Steps',
    'Issue1_Expected', 'Issue2_Expected',
    'Issue1_Actual', 'Issue2_Actual', 'Similarity'
]

# Tabulate the collected pairs
pairs_df = pd.DataFrame(pairs, columns=pair_columns)

# Write the pairs to disk
pairs_df.to_csv('similarity_pairs_0.85_0.9.csv', index=False)


In [42]:
# Collect up to 10 example pairs of issues whose title similarity lies in
# [0.95, 0.98). Every issue number appears in at most one pair.
# NOTE: relies on `issue_titles` and `title_similarity_matrix` already existing
# in the kernel; row/column k of the matrix must describe row k of the frame.
title_pairs = []
used_issues = set()
n = len(issue_titles)
if title_similarity_matrix.shape != (n, n):
    raise ValueError(
        "title_similarity_matrix does not match issue_titles; "
        "recompute it before sampling pairs."
    )
title_numbers = issue_titles['number'].tolist()  # one lookup instead of repeated .iloc access
title_texts = issue_titles['title'].tolist()

for i in range(n):
    if len(title_pairs) >= 10:
        break
    if title_numbers[i] in used_issues:
        continue
    # Positions right of the diagonal whose similarity falls inside the band.
    row = title_similarity_matrix[i, i + 1:]
    in_band_offsets = ((row >= 0.95) & (row < 0.98)).nonzero()[0]
    for offset in in_band_offsets.tolist():
        j = i + 1 + offset
        if title_numbers[j] in used_issues:
            continue
        title_pairs.append((
            title_numbers[i],
            title_numbers[j],
            title_texts[i],
            title_texts[j],
            title_similarity_matrix[i, j]
        ))
        used_issues.update((title_numbers[i], title_numbers[j]))
        # Issue i is now used: stop scanning its row. Without this, the same
        # issue i could be paired again with a later j.
        break

# Column layout of one title-pair tuple
title_pair_columns = [
    'Issue1_Number', 'Issue2_Number', 'Issue1_Title', 'Issue2_Title', 'Similarity'
]

# Tabulate the collected title pairs
title_pairs_df = pd.DataFrame(title_pairs, columns=title_pair_columns)

# Write the title pairs to disk
title_pairs_df.to_csv('title_similarity_pairs_0.95_0.98.csv', index=False)

In [1]:
from scripts.analysis.issue_filter import IssueSimilarityFilter

# Arguments for IssueSimilarityFilter.filter_issues, passed positionally.
# NOTE(review): presumably (classification CSV, all-issues CSV, output CSV) -
# confirm against the method's signature in scripts.analysis.issue_filter.
classification_csv = 'erpnext_issues_classification.csv'
all_issues_csv = 'erpnext_issues_all.csv'
output_csv = 'filtered_issues_title.csv'

filter_instance = IssueSimilarityFilter()
filter_instance.filter_issues(classification_csv, all_issues_csv, output_csv)

  from .autonotebook import tqdm as notebook_tqdm
