In [1]:
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
import torch
import csv
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random
from sentence_transformers import InputExample


SEED = 69  # Change this number for different results

def set_seed(seed):
    random.seed(seed)  # Python's random module
    np.random.seed(seed)  # NumPy
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU (if available)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic optimizations


In [3]:
from torch.utils.data import Dataset, DataLoader
class PairDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file, encoding="utf-8")
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        return InputExample(texts=[row["text1"], row["text2"]], label=float(row["label"]))

# 加载 CSV 数据集
train_dataset = PairDataset("train_pairs.csv")
test_dataset = PairDataset("test_pairs.csv")

In [6]:
#ADAPTER TRAINING

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
import adapters

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, num_workers=3, pin_memory=True, prefetch_factor=2, worker_init_fn=lambda _: set_seed(SEED))
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Add an adapter for fine-tuning
adapter_name = "pfeiffer_adapter"

# Get the underlying Hugging Face transformer model
transformer_model = model._first_module().auto_model 

# Add and activate adapter
adapters.init(transformer_model)
transformer_model.add_adapter(adapter_name, config="pfeiffer", set_active=True)
transformer_model.train_adapter(adapter_name)

train_loss = losses.CosineSimilarityLoss(model)
num_epochs = 2
warmup_steps = int(0.05 * len(train_dataloader) * num_epochs)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    use_amp=True,
    show_progress_bar=True,
    optimizer_params={"lr": 2.5e-5, "weight_decay": 0.95}  # Learning rate and weight decay
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                                                     

Step,Training Loss
500,0.1234
1000,0.1056
1500,0.0914
2000,0.0871
2500,0.0842
3000,0.0815
3500,0.0801
4000,0.0785
4500,0.0787
5000,0.0774


In [7]:
from sentence_transformers import util
import torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (
    roc_curve, auc,
    precision_recall_curve,
    confusion_matrix,
    ConfusionMatrixDisplay
)

def evaluate_model(model, test_samples):
    
    all_similarities = []
    all_labels = []
    
   
    for sample in test_samples:
        text1, text2 = sample.texts
        label = sample.label
        
       
        emb1 = model.encode(text1, convert_to_tensor=True)
        emb2 = model.encode(text2, convert_to_tensor=True)
        
       
        similarity = util.pytorch_cos_sim(emb1, emb2).item()
        
        all_similarities.append(similarity)
        all_labels.append(label)

    
    similarities = np.array(all_similarities)
    labels = np.array(all_labels)

    
    # 1. ROC and AUC
    fpr, tpr, thresholds_roc = roc_curve(labels, similarities)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig('roc_curve.png')
    plt.close()

    # 2. Precision-recall curve
    precision, recall, thresholds_pr = precision_recall_curve(labels, similarities)
    avg_precision = auc(recall, precision)
    
    plt.figure(figsize=(10, 6))
    plt.plot(recall, precision, color='blue', lw=2, label=f'AP={avg_precision:.2f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="upper right")
    plt.savefig('pr_curve.png')
    plt.close()

    # 3. Threshold evaluation
    thresholds = np.linspace(0, 1, 100)
    f1_scores = []
    f2_scores = []
    precisions = []
    recalls = []
    accuracies = []
    
    for thresh in thresholds:
        preds = (similarities > thresh).astype(int)
        tp = np.sum((preds == 1) & (labels == 1))
        fp = np.sum((preds == 1) & (labels == 0))
        fn = np.sum((preds == 0) & (labels == 1))
        tn = np.sum((preds == 0) & (labels == 0))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f2 = (5 * precision * recall) / (4 * precision + recall) if (precision + recall) > 0 else 0
        accuracy = (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0
        
        f1_scores.append(f1)
        f2_scores.append(f2)
        precisions.append(precision)
        recalls.append(recall)
        accuracies.append(accuracy)

    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, f1_scores, label='F1 Score')
    plt.plot(thresholds, f2_scores, label='F2 Score')
    plt.plot(thresholds, precisions, label='Precision')
    plt.plot(thresholds, recalls, label='Recall')
    plt.plot(thresholds, accuracies, label='Accuracy')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title('Metrics vs Threshold')
    plt.legend()
    plt.grid()
    plt.savefig('metrics_vs_threshold.png')
    plt.close()

    # 4. Confusion matrix
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    preds = (similarities > best_thresh).astype(int)
    
    cm = confusion_matrix(labels, preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Non-Duplicate', 'Duplicate'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix (Threshold={best_thresh:.2f})')
    plt.savefig('confusion_matrix.png')
    plt.close()

    # 5. Similarity distribution
    plt.figure(figsize=(10, 6))
    plt.hist(similarities[labels == 1], bins=30, alpha=0.5, label='Duplicates', color='g')
    plt.hist(similarities[labels == 0], bins=30, alpha=0.5, label='Non-Duplicates', color='r')
    plt.xlabel('Similarity Score')
    plt.ylabel('Count')
    plt.title('Similarity Score Distribution')
    plt.legend()
    plt.savefig('similarity_distribution.png')
    plt.close()

    # Print metrics
    print(f"\nBest Threshold: {best_thresh:.4f}")
    print(f"Precision: {precisions[best_idx]:.4f}")
    print(f"Recall: {recalls[best_idx]:.4f}")
    print(f"F1-score: {f1_scores[best_idx]:.4f}")
    print(f"F2-score: {f2_scores[best_idx]:.4f}")
    print(f"Accuracy: {accuracies[best_idx]:.4f}")

evaluate_model(model, test_dataset)



Best Threshold: 0.0000
Precision: 0.5604
Recall: 0.8844
F1-score: 0.6861
F2-score: 0.7928
Accuracy: 0.5950
