# J001 Assignment 5

Comparing bi-encoder and cross-encoder approaches for question pair similarity detection on Quora dataset.

## Dependencies

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sentence_transformers import (
    SentenceTransformer, 
    util, 
    InputExample, 
    losses, 
    CrossEncoder
)
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os

2025-09-07 05:46:59.994453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757224020.161745      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757224020.212270      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data Preparation

In [2]:
!rm train.csv
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

rm: cannot remove 'train.csv': No such file or directory
Archive:  /kaggle/input/quora-question-pairs/train.csv.zip
  inflating: train.csv               


In [3]:
data = pd.read_csv("train.csv")

SUBSET_SIZE = 50000
stratified_subset, _ = train_test_split(
    data, 
    train_size=SUBSET_SIZE, 
    stratify=data["is_duplicate"], 
    random_state=42
)

train_data, temp_data = train_test_split(
    stratified_subset, 
    test_size=0.2, 
    stratify=stratified_subset["is_duplicate"], 
    random_state=42
)
validation_data, test_data = train_test_split(
    temp_data, 
    test_size=0.5, 
    stratify=temp_data["is_duplicate"], 
    random_state=42
)

splits_dir = "splits"
os.makedirs(splits_dir, exist_ok=True)

datasets = {
    "train": train_data,
    "valid": validation_data, 
    "test": test_data
}

for name, dataset in datasets.items():
    dataset.to_csv(f"{splits_dir}/{name}.csv", index=False)

print("Dataset sizes:", {name: len(dataset) for name, dataset in datasets.items()})

Dataset sizes: {'train': 40000, 'valid': 5000, 'test': 5000}


## Baseline Model

In [4]:
test_data = pd.read_csv("splits/test.csv")

baseline_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

question1_embeddings = baseline_model.encode(
    test_data["question1"].tolist(), 
    batch_size=128, 
    convert_to_numpy=True
)
question2_embeddings = baseline_model.encode(
    test_data["question2"].tolist(), 
    batch_size=128, 
    convert_to_numpy=True
)

cosine_similarities = util.cos_sim(question1_embeddings, question2_embeddings).diagonal().cpu().numpy()

optimal_f1, optimal_threshold = 0, 0
threshold_range = np.arange(-1.0, 1.01, 0.01)

for threshold in threshold_range:
    predictions = (cosine_similarities >= threshold).astype(int)
    current_f1 = f1_score(test_data["is_duplicate"], predictions)
    if current_f1 > optimal_f1:
        optimal_f1, optimal_threshold = current_f1, threshold

print(f"[Baseline] Test F1={optimal_f1:.4f} at threshold={optimal_threshold:.2f}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

[Baseline] Test F1=0.7345 at threshold=0.75


## Bi-Encoder Models

Training three models with different loss functions:
- **CosineSimilarityLoss**: all-MiniLM-L6-v2  
- **OnlineContrastiveLoss**: paraphrase-MiniLM-L6-v2
- **MultipleNegativesRankingLoss**: all-mpnet-base-v2

In [5]:
os.environ['WANDB_DISABLED'] = 'true'

def create_input_examples(dataframe):
    examples = []
    for _, row in dataframe.iterrows():
        example = InputExample(
            texts=[row["question1"], row["question2"]], 
            label=float(row["is_duplicate"])
        )
        examples.append(example)
    return examples

def find_optimal_threshold(similarities, true_labels):
    best_score, best_thresh = 0.0, 0.0
    for thresh in np.arange(-1.0, 1.01, 0.01):
        predictions = (similarities >= thresh).astype(int)
        score = f1_score(true_labels, predictions)
        if score > best_score:
            best_score, best_thresh = score, thresh
    return best_score, best_thresh

def evaluate_model_performance(model, examples, loss_name):
    texts_1 = [ex.texts[0] for ex in examples]
    texts_2 = [ex.texts[1] for ex in examples]
    true_labels = [int(ex.label) if loss_name != "cos" else (1 if ex.label == 1.0 else 0) for ex in examples]
    
    embeddings_1 = model.encode(texts_1, batch_size=128, convert_to_numpy=True)
    embeddings_2 = model.encode(texts_2, batch_size=128, convert_to_numpy=True)
    similarities = util.cos_sim(embeddings_1, embeddings_2).diagonal().cpu().numpy()
    
    return find_optimal_threshold(similarities, true_labels)

def train_biencoder_model(loss_name, model_name, training_examples, validation_examples, test_examples):
    encoder_model = SentenceTransformer(model_name)
    
    loss_functions = {
        "mnr": lambda: losses.MultipleNegativesRankingLoss(encoder_model),
        "cos": lambda: losses.CosineSimilarityLoss(encoder_model),
        "contrastive": lambda: losses.OnlineContrastiveLoss(
            encoder_model,
            distance_metric=losses.SiameseDistanceMetric.COSINE_DISTANCE,
            margin=0.5
        )
    }
    
    if loss_name == "cos":
        for example in training_examples:
            example.label = 1.0 if example.label == 1.0 else -1.0
    
    loss_function = loss_functions[loss_name]()
    dataloader = DataLoader(training_examples, shuffle=True, batch_size=32)
    
    encoder_model.fit(
        train_objectives=[(dataloader, loss_function)],
        epochs=2,
        warmup_steps=100,
        optimizer_params={'lr': 2e-5},
        show_progress_bar=True
    )
    
    val_f1, threshold = evaluate_model_performance(encoder_model, validation_examples, loss_name)
    test_f1, _ = evaluate_model_performance(encoder_model, test_examples, loss_name)
    
    print(f"[{loss_name}] Validation F1={val_f1:.4f} | Test F1={test_f1:.4f} at threshold={threshold:.2f}")
    return encoder_model, test_f1

datasets = {}
for split in ["train", "valid", "test"]:
    datasets[split] = pd.read_csv(f"splits/{split}.csv")

training_examples = create_input_examples(datasets["train"])
validation_examples = create_input_examples(datasets["valid"])
testing_examples = create_input_examples(datasets["test"])

model_configs = [
    ("cos", "sentence-transformers/all-MiniLM-L6-v2"),
    ("contrastive", "sentence-transformers/paraphrase-MiniLM-L6-v2"),
    ("mnr", "sentence-transformers/all-mpnet-base-v2")
]

trained_models = {}
for loss_type, base_model in model_configs:
    model, model_f1_score = train_biencoder_model(
        loss_type, base_model, training_examples, validation_examples, testing_examples
    )
    trained_models[loss_type] = (model, model_f1_score)

cos_model, cos_f1 = trained_models["cos"]
contrast_model, contrast_f1 = trained_models["contrastive"] 
mnr_model, mnr_f1 = trained_models["mnr"]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,1.1303
1000,1.0026
1500,0.9771
2000,0.9612
2500,0.9503


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

[cos] Validation F1=0.6987 | Test F1=0.6831 at threshold=0.23


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.054
1000,0.0003
1500,0.0002
2000,0.0002
2500,0.0002


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

[contrastive] Validation F1=0.5397 | Test F1=0.5400 at threshold=0.99


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.4067
1000,0.3476
1500,0.2855
2000,0.248
2500,0.248


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

[mnr] Validation F1=0.7207 | Test F1=0.7163 at threshold=0.75


## Cross-Encoder Model

In [6]:
cross_encoder_train_examples = [
    InputExample(texts=[row["question1"], row["question2"]], label=float(row["is_duplicate"]))
    for _, row in datasets["train"].iterrows()
]

cross_encoder_val_samples = [
    (row["question1"], row["question2"], int(row["is_duplicate"]))
    for _, row in datasets["valid"].iterrows()
]

cross_encoder_test_samples = [
    (row["question1"], row["question2"], int(row["is_duplicate"]))
    for _, row in datasets["test"].iterrows()
]

train_loader = DataLoader(cross_encoder_train_examples, shuffle=True, batch_size=16)

cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

cross_encoder_model.fit(
    train_dataloader=train_loader,
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)

def evaluate_cross_encoder(model, sample_pairs):
    question_pairs = [(q1, q2) for q1, q2, _ in sample_pairs]
    ground_truth_labels = [label for _, _, label in sample_pairs]

    prediction_scores = model.predict(question_pairs)

    best_f1_score, best_threshold = 0, 0
    for threshold in np.linspace(0, 1, 101):
        binary_predictions = (prediction_scores >= threshold).astype(int)
        current_f1 = f1_score(ground_truth_labels, binary_predictions)
        if current_f1 > best_f1_score:
            best_f1_score, best_threshold = current_f1, threshold
    return best_f1_score, best_threshold

validation_f1, optimal_thresh = evaluate_cross_encoder(cross_encoder_model, cross_encoder_val_samples)
test_f1_final, _ = evaluate_cross_encoder(cross_encoder_model, cross_encoder_test_samples)

print(f"[CrossEncoder] Validation F1={validation_f1:.4f} | Test F1={test_f1_final:.4f} at threshold={optimal_thresh:.2f}")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.4933
1000,0.3976
1500,0.3752
2000,0.3807
2500,0.3651


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

[CrossEncoder] Validation F1=0.8042 | Test F1=0.7962 at threshold=0.23


## Model Performance Comparison

In [7]:
performance_results = {
    "Baseline Model": optimal_f1,
    "Cosine Similarity": cos_f1,
    "Contrastive Learning": contrast_f1,
    "Multiple Negatives Ranking": mnr_f1,
    "Cross-Encoder": test_f1_final
}

for model_name, score in performance_results.items():
    print(f"{model_name}: {score:.4f}" if isinstance(score, float) else f"{model_name}: {score}")

print(f"\nBest performing model: {max(performance_results, key=lambda k: performance_results[k] if isinstance(performance_results[k], float) else 0)}")

Baseline Model: 0.7345
Cosine Similarity: 0.6831
Contrastive Learning: 0.5400
Multiple Negatives Ranking: 0.7163
Cross-Encoder: 0.7962

Best performing model: Cross-Encoder
