In [1]:
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
import torch
import csv
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
eclipse_url = "https://github.com/logpai/bughub/raw/refs/heads/master/EclipsePlatform/"
firefox_url = "https://raw.githubusercontent.com/logpai/bughub/refs/heads/master/Firefox/"
jdt_url = "https://raw.githubusercontent.com/logpai/bughub/refs/heads/master/JDT/"
mozilla_url = "https://github.com/logpai/bughub/raw/refs/heads/master/MozillaCore/"
thunderbird_url = "https://raw.githubusercontent.com/logpai/bughub/refs/heads/master/Thunderbird/"

def load_from_bughub(url, bugs):
    dataset = load_dataset("csv", data_files = url + bugs)
    train = load_dataset("csv", data_files = url + "train.csv")
    test = load_dataset("csv", data_files = url + "test.csv")
    return dataset, train, test


eclipse_dataset, eclipse_train, eclipse_test = load_from_bughub(eclipse_url, "eclipse_platform.zip")
firefox_dataset, firefox_train, firefox_test = load_from_bughub(firefox_url, "mozilla_firefox.zip")
jdt_dataset, jdt_train, jdt_test = load_from_bughub(jdt_url,"eclipse_jdt.csv")
mozilla_dataset, mozilla_train, mozilla_test = load_from_bughub(mozilla_url, "mozilla_core.zip")
thunderbird_dataset, thunderbird_train, thunderbird_test = load_from_bughub(thunderbird_url, "mozilla_thunderbird.csv")

In [6]:
def create_datasets(dataset, train, test):
    dataset_df = dataset['train'].to_pandas()
    train_df = train['train'].to_pandas()
    test_df = test['train'].to_pandas()    
    train_data = train_df.merge(dataset_df, on="Issue_id", how="left")
    train_data = pd.DataFrame(train_data).values.tolist()
    test_data = test_df.merge(dataset_df, on="Issue_id", how="left")
    test_data = pd.DataFrame(test_data).values.tolist()
    issue_dict = {issue["Issue_id"]: issue for issue in dataset["train"].to_list()}
    return train_data, test_data, issue_dict

eclipse_train, eclipse_test, eclipse_dict = create_datasets(eclipse_dataset, eclipse_train, eclipse_test)
firefox_train, firefox_test, firefox_dict = create_datasets(firefox_dataset, firefox_train, firefox_test)
jdt_train, jdt_test, jdt_dict = create_datasets(jdt_dataset, jdt_train, jdt_test)
mozilla_train, mozilla_test, mozilla_dict = create_datasets(mozilla_dataset, mozilla_train, mozilla_test)
thunderbird_train, thunderbird_test, thunderbird_dict = create_datasets(thunderbird_dataset, thunderbird_train, thunderbird_test)

In [4]:


# 保存 eclipse 数据集
pd.DataFrame(eclipse_train).to_csv('eclipse_train.csv', index=False)
pd.DataFrame(eclipse_test).to_csv('eclipse_test.csv', index=False)
pd.DataFrame.from_dict(eclipse_dict, orient='index').to_csv('eclipse_dict.csv', index=False)

# 保存 firefox 数据集
pd.DataFrame(firefox_train).to_csv('firefox_train.csv', index=False)
pd.DataFrame(firefox_test).to_csv('firefox_test.csv', index=False)
pd.DataFrame.from_dict(firefox_dict, orient='index').to_csv('firefox_dict.csv', index=False)

# 保存 jdt 数据集
pd.DataFrame(jdt_train).to_csv('jdt_train.csv', index=False)
pd.DataFrame(jdt_test).to_csv('jdt_test.csv', index=False)
pd.DataFrame.from_dict(jdt_dict, orient='index').to_csv('jdt_dict.csv', index=False)

# 保存 mozilla 数据集
pd.DataFrame(mozilla_train).to_csv('mozilla_train.csv', index=False)
pd.DataFrame(mozilla_test).to_csv('mozilla_test.csv', index=False)
pd.DataFrame.from_dict(mozilla_dict, orient='index').to_csv('mozilla_dict.csv', index=False)

# 保存 thunderbird 数据集
pd.DataFrame(thunderbird_train).to_csv('thunderbird_train.csv', index=False)
pd.DataFrame(thunderbird_test).to_csv('thunderbird_test.csv', index=False)
pd.DataFrame.from_dict(thunderbird_dict, orient='index').to_csv('thunderbird_dict.csv', index=False)

In [7]:
import random
from sentence_transformers import InputExample


SEED = 69  # Change this number for different results

def set_seed(seed):
    random.seed(seed)  # Python's random module
    np.random.seed(seed)  # NumPy
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU (if available)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic optimizations


def create_pairs(dataset, issue_dict):
    pairs = []
    for entry in dataset:
        issue_id = entry[0] 
        duplicates = entry[1]  # List of duplicate issue IDs
        
        if duplicates:
            duplicates = entry[1].split(";")
            for dup_id in duplicates:
                if int(dup_id) in issue_dict:
                    dup_id = int(dup_id)
                    bug_1 = (issue_dict[issue_id]["Component"] or "") + " " + (issue_dict[issue_id]["Title"] or "") + " " + (issue_dict[issue_id]["Description"] or "")
                    bug_2 = (issue_dict[dup_id]["Component"] or "") + " " + (issue_dict[dup_id]["Title"] or "") + " " + (issue_dict[dup_id]["Description"] or "")
                    pairs.append(InputExample(
                        texts = [bug_1, bug_2],
                        label = 1
                    ))

            # Create negative samples (random non-duplicate issues)
            for _ in range(len(duplicates)):  # Create equal number of negative pairs
                random_id = random.choice(list(issue_dict.keys()))
                if str(random_id) not in duplicates and random_id != issue_id:
                    bug_1 = (issue_dict[issue_id]["Component"] or "") + " " + (issue_dict[issue_id]["Title"] or "") + " " + (issue_dict[issue_id]["Description"] or "")
                    bug_2 = (issue_dict[random_id]["Component"] or "") + " " + (issue_dict[random_id]["Title"] or "") + " " + (issue_dict[random_id]["Description"] or "")
                    pairs.append(InputExample(
                        texts = [bug_1, bug_2],
                        label = 0
                    ))
    
    return pairs

from concurrent.futures import ThreadPoolExecutor

datasets = [
    ("eclipse_train", eclipse_train, eclipse_dict),
    ("eclipse_test", eclipse_test, eclipse_dict),
    ("firefox_train", firefox_train, firefox_dict),
    ("firefox_test", firefox_test, firefox_dict),
    ("jdt_train", jdt_train, jdt_dict),
    ("jdt_test", jdt_test, jdt_dict),
    ("mozilla_train", mozilla_train, mozilla_dict),
    ("mozilla_test", mozilla_test, mozilla_dict),
    ("thunderbird_train", thunderbird_train, thunderbird_dict),
    ("thunderbird_test", thunderbird_test, thunderbird_dict),
]

results = {}

def process_dataset(name, data, dictionary):
    return name, create_pairs(data, dictionary)

with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_dataset, name, data, dictionary): name for name, data, dictionary in datasets}
    
    for future in futures:
        name, result = future.result()
        results[name] = result

# Assign results to variables dynamically
eclipse_train_samples = results["eclipse_train"]
eclipse_test_samples = results["eclipse_test"]
firefox_train_samples = results["firefox_train"]
firefox_test_samples = results["firefox_test"]
jdt_train_samples = results["jdt_train"]
jdt_test_samples = results["jdt_test"]
mozilla_train_samples = results["mozilla_train"]
mozilla_test_samples = results["mozilla_test"]
thunderbird_train_samples = results["thunderbird_train"]
thunderbird_test_samples = results["thunderbird_test"]

train_samples = eclipse_train_samples + firefox_train_samples + jdt_train_samples + mozilla_train_samples + thunderbird_train_samples
test_samples =  eclipse_test_samples + firefox_test_samples + jdt_test_samples + mozilla_test_samples + thunderbird_test_samples

In [2]:
import random
from sentence_transformers import InputExample


SEED = 69  # Change this number for different results

def set_seed(seed):
    random.seed(seed)  # Python's random module
    np.random.seed(seed)  # NumPy
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU (if available)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic optimizations


In [6]:
def pairs_to_dataframe(pairs):
    data = []
    for pair in pairs:
        # pair.texts 是一个包含两个文本的列表，pair.label 是标签
        data.append({
            "text1": pair.texts[0],
            "text2": pair.texts[1],
            "label": pair.label
        })
    return pd.DataFrame(data)

# 转换训练和测试数据为 DataFrame
df_train = pairs_to_dataframe(train_samples)
df_test = pairs_to_dataframe(test_samples)

# 保存为 CSV 文件到当前文件夹
df_train.to_csv("train_pairs.csv", index=False, encoding="utf-8")
df_test.to_csv("test_pairs.csv", index=False, encoding="utf-8")

In [3]:
from torch.utils.data import Dataset, DataLoader
class PairDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file, encoding="utf-8")
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        return InputExample(texts=[row["text1"], row["text2"]], label=float(row["label"]))

# 加载 CSV 数据集
train_dataset = PairDataset("train_pairs.csv")
test_dataset = PairDataset("test_pairs.csv")

In [4]:
from torch.utils.data import DataLoader
from sentence_transformers import losses

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)  # multiple GPU
model.to(device)

train_loss = losses.CosineSimilarityLoss(model)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, num_workers=3, pin_memory=True, prefetch_factor=2, worker_init_fn=lambda _: set_seed(SEED))

num_epochs = 2
warmup_steps = int(0.05 * len(train_dataloader) * num_epochs)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    use_amp=True,
    show_progress_bar=True,
    optimizer_params={"lr": 2e-5, "weight_decay": 1.00}  # Adjust these values
)

                                                                     

Step,Training Loss
500,0.094
1000,0.0719
1500,0.0619
2000,0.0554
2500,0.0518
3000,0.0492
3500,0.0475
4000,0.0451
4500,0.0432
5000,0.0416


In [10]:
from sentence_transformers import util
import torch

# Function to evaluate test samples
def evaluate_model(model, test_samples):
    TP, FP, FN = 0, 0, 0
    total = len(test_samples)

    for sample in test_dataset:
        text1, text2 = sample.texts
        label = sample.label  # Ground truth (1 for duplicate, 0 for non-duplicate)

        # Compute embeddings
        emb1 = model.encode(text1, convert_to_tensor=True)
        emb2 = model.encode(text2, convert_to_tensor=True)

        # Compute cosine similarity
        similarity = util.pytorch_cos_sim(emb1, emb2).item()

        # Threshold (adjust based on your dataset)
        prediction = 1 if similarity > 0.5 else 0

        # Compare with ground truth
        # Count TP, FP, FN
        if prediction == 1 and label == 1:
            TP += 1  # True Positive
        elif prediction == 1 and label == 0:
            FP += 1  # False Positive
        elif prediction == 0 and label == 1:
            FN += 1  # False Negative


    # Compute metrics
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    f2_score = (5 * precision * recall) / ((4 * precision) + recall) if (precision + recall) > 0 else 0 
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")
    print(f"F2-score: {f2_score:.4f}") 

# Run evaluation
evaluate_model(model, test_dataset)

Precision: 0.8861
Recall: 0.5748
F1-score: 0.6972
F2-score: 0.6182


In [5]:
from sentence_transformers import util
import torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (
    roc_curve, auc,
    precision_recall_curve,
    confusion_matrix,
    ConfusionMatrixDisplay
)

def evaluate_model(model, test_samples):
    
    all_similarities = []
    all_labels = []
    
   
    for sample in test_samples:
        text1, text2 = sample.texts
        label = sample.label
        
       
        emb1 = model.encode(text1, convert_to_tensor=True)
        emb2 = model.encode(text2, convert_to_tensor=True)
        
       
        similarity = util.pytorch_cos_sim(emb1, emb2).item()
        
        all_similarities.append(similarity)
        all_labels.append(label)

    
    similarities = np.array(all_similarities)
    labels = np.array(all_labels)

    
    # 1. ROC and AUC
    fpr, tpr, thresholds_roc = roc_curve(labels, similarities)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig('roc_curve.png')
    plt.close()

    # 2. Precision-recall curve
    precision, recall, thresholds_pr = precision_recall_curve(labels, similarities)
    avg_precision = auc(recall, precision)
    
    plt.figure(figsize=(10, 6))
    plt.plot(recall, precision, color='blue', lw=2, label=f'AP={avg_precision:.2f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="upper right")
    plt.savefig('pr_curve.png')
    plt.close()

    # 3. Threshold evaluation
    thresholds = np.linspace(0, 1, 100)
    f1_scores = []
    f2_scores = []
    precisions = []
    recalls = []
    accuracies = []
    
    for thresh in thresholds:
        preds = (similarities > thresh).astype(int)
        tp = np.sum((preds == 1) & (labels == 1))
        fp = np.sum((preds == 1) & (labels == 0))
        fn = np.sum((preds == 0) & (labels == 1))
        tn = np.sum((preds == 0) & (labels == 0))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f2 = (5 * precision * recall) / (4 * precision + recall) if (precision + recall) > 0 else 0
        accuracy = (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0
        
        f1_scores.append(f1)
        f2_scores.append(f2)
        precisions.append(precision)
        recalls.append(recall)
        accuracies.append(accuracy)

    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, f1_scores, label='F1 Score')
    plt.plot(thresholds, f2_scores, label='F2 Score')
    plt.plot(thresholds, precisions, label='Precision')
    plt.plot(thresholds, recalls, label='Recall')
    plt.plot(thresholds, accuracies, label='Accuracy')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title('Metrics vs Threshold')
    plt.legend()
    plt.grid()
    plt.savefig('metrics_vs_threshold.png')
    plt.close()

    # 4. Confusion matrix
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    preds = (similarities > best_thresh).astype(int)
    
    cm = confusion_matrix(labels, preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Non-Duplicate', 'Duplicate'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix (Threshold={best_thresh:.2f})')
    plt.savefig('confusion_matrix.png')
    plt.close()

    # 5. Similarity distribution
    plt.figure(figsize=(10, 6))
    plt.hist(similarities[labels == 1], bins=30, alpha=0.5, label='Duplicates', color='g')
    plt.hist(similarities[labels == 0], bins=30, alpha=0.5, label='Non-Duplicates', color='r')
    plt.xlabel('Similarity Score')
    plt.ylabel('Count')
    plt.title('Similarity Score Distribution')
    plt.legend()
    plt.savefig('similarity_distribution.png')
    plt.close()

    # Print metrics
    print(f"\nBest Threshold: {best_thresh:.4f}")
    print(f"Precision: {precisions[best_idx]:.4f}")
    print(f"Recall: {recalls[best_idx]:.4f}")
    print(f"F1-score: {f1_scores[best_idx]:.4f}")
    print(f"F2-score: {f2_scores[best_idx]:.4f}")
    print(f"Accuracy: {accuracies[best_idx]:.4f}")

evaluate_model(model, test_dataset)



Best Threshold: 0.0606
Precision: 0.7669
Recall: 0.7584
F1-score: 0.7626
F2-score: 0.7601
Accuracy: 0.7637
