# Sentence Similarity

In [1]:
from datasets import load_dataset

# Load the "labeled_final" configuration of the PAWS dataset. The result is a
# DatasetDict with train / test / validation splits; each example carries the
# fields 'id', 'sentence1', 'sentence2' and 'label'.
ds = load_dataset("google-research-datasets/paws", "labeled_final")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 49401/49401 [00:00<00:00, 699607.70 examples/s]
Generating test split: 100%|██████████| 8000/8000 [00:00<00:00, 1438375.86 examples/s]
Generating validation split: 100%|██████████| 8000/8000 [00:00<00:00, 1399617.59 examples/s]


In [2]:
# Display the loaded splits with their feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [3]:
import torch
import torch.nn.functional as F

In [4]:
# Download the spaCy model (uncomment and run once if it is not installed yet)
# !python -m spacy download en_core_web_md

In [5]:
# Load the spaCy pipeline "en_core_web_md". It must already be installed
# (see the commented-out download command in the previous cell). The cells
# below read per-token vectors and `nlp.vocab.vectors_length` from it.
import spacy
nlp = spacy.load("en_core_web_md")

In [6]:
# Example: two short phrases whose similarity is measured below.
sentence1, sentence2 = "did run quick", "swim fast"

# Parse both phrases with the loaded spaCy pipeline.
doc1, doc2 = nlp(sentence1), nlp(sentence2)

def get_sentence_embedding(doc):
    """Return the mean word vector of ``doc`` as a 1-D torch tensor.

    Only tokens whose ``has_vector`` flag is set contribute to the mean.

    Args:
        doc: An iterable of tokens exposing ``vector`` and ``has_vector``
            (e.g. a parsed spaCy document) that also provides
            ``vocab.vectors_length``.

    Returns:
        torch.Tensor: The element-wise mean of the token vectors, or a zero
        tensor of length ``doc.vocab.vectors_length`` when no token in
        ``doc`` has a vector.
    """
    # Convert each vector on its own and stack the results: handing a list of
    # numpy arrays straight to torch.tensor() is very slow and emits a
    # UserWarning.
    token_vectors = [torch.tensor(token.vector) for token in doc if token.has_vector]
    if not token_vectors:
        # Read the vector width from the document's own vocab so the helper
        # does not depend on a module-level `nlp` object.
        return torch.zeros(doc.vocab.vectors_length)
    return torch.stack(token_vectors).mean(dim=0)

# Embed both parsed example phrases.
embedding_sentence1, embedding_sentence2 = map(get_sentence_embedding, (doc1, doc2))

# Cosine similarity of the two 1-D embeddings (hence dim=0).
cosine_similarity = F.cosine_similarity(
    embedding_sentence1,
    embedding_sentence2,
    dim=0,
)

print(f"Cosine similarity between the sentences: {cosine_similarity.item():.4f}")


Cosine similarity between the sentences: 0.4713


  return torch.mean(torch.tensor(word_embeddings), dim=0)


In [8]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm

def evaluate_phrase_similarity(ds, nlp, threshold=0.5, split='test'):
    """Evaluate mean-word-vector cosine similarity against the labels of a split.

    For every example, both sentences are parsed with ``nlp``, embedded as the
    mean of their token vectors and compared by cosine similarity. Labels and
    similarities are then min-max normalised, thresholded into binary classes
    and scored.

    Args:
        ds: Mapping from split name to an iterable of examples. Each example
            must provide the keys 'sentence1', 'sentence2' and 'label'.
        nlp: Callable turning a string into an iterable of tokens exposing
            ``vector`` and ``has_vector``. ``nlp.vocab.vectors_length`` is
            read when a sentence has no token with a vector.
        threshold: Cut-off applied to the normalised scores; values greater
            than or equal to it are counted as the positive class.
        split: Name of the split in ``ds`` to evaluate. Defaults to 'test'.

    Returns:
        dict: 'accuracy', 'precision', 'recall', 'f1_score' and 'correlation',
        where 'correlation' is the mean of the Pearson and Spearman
        coefficients between the raw labels and the raw similarities.

    Raises:
        ValueError: If the selected split contains no examples.
    """
    def get_sentence_embedding(doc):
        # Convert vectors one by one and stack them: torch.tensor() on a list
        # of numpy arrays is very slow and emits a UserWarning.
        token_vectors = [torch.tensor(token.vector) for token in doc if token.has_vector]
        if not token_vectors:
            return torch.zeros(nlp.vocab.vectors_length)
        return torch.stack(token_vectors).mean(dim=0)

    def min_max_normalize(values):
        # Rescale to [0, 1]. When every value is identical there is nothing to
        # rescale (and dividing would give 0/0 = NaN), so the raw values are
        # kept; this keeps an all-positive label set positive.
        lowest = values.min()
        value_range = values.max() - lowest
        if value_range == 0:
            return values.astype(float)
        return (values - lowest) / value_range

    examples = ds[split]

    true_scores = []
    predicted_scores = []

    for example in tqdm(examples, desc=f"Evaluating {split} data"):
        embedding1 = get_sentence_embedding(nlp(example['sentence1']))
        embedding2 = get_sentence_embedding(nlp(example['sentence2']))

        similarity = F.cosine_similarity(embedding1, embedding2, dim=0).item()
        predicted_scores.append(similarity)

        true_scores.append(example['label'])

    if not true_scores:
        # Fail with a clear message instead of numpy's zero-size reduction error.
        raise ValueError(f"Split {split!r} contains no examples to evaluate")

    # Normalize scores to 0-1 range
    true_scores = np.array(true_scores)
    predicted_scores = np.array(predicted_scores)
    true_scores_norm = min_max_normalize(true_scores)
    # NOTE(review): similarities are rescaled by the evaluated split's own
    # min/max, so `threshold` is relative to the observed range rather than an
    # absolute cosine-similarity value.
    predicted_scores_norm = min_max_normalize(predicted_scores)

    # Calculate correlations on the raw (un-normalised) values
    pearson_corr, _ = pearsonr(true_scores, predicted_scores)
    spearman_corr, _ = spearmanr(true_scores, predicted_scores)
    correlation = (pearson_corr + spearman_corr) / 2

    # Convert to binary predictions
    true_binary = (true_scores_norm >= threshold).astype(int)
    predicted_binary = (predicted_scores_norm >= threshold).astype(int)

    # Calculate metrics; zero_division=0 makes the "no positive predictions /
    # no positive labels" case return 0 explicitly instead of warning.
    accuracy = accuracy_score(true_binary, predicted_binary)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_binary, predicted_binary, average='binary', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'correlation': correlation
    }

# Run the evaluation with the default threshold and print each metric
# rounded to four decimal places.
metrics = evaluate_phrase_similarity(ds, nlp)
for metric_name in metrics:
    value = metrics[metric_name]
    print(f"{metric_name}: {value:.4f}")


Evaluating test data: 100%|██████████| 8000/8000 [01:07<00:00, 119.35it/s]

accuracy: 0.4434
precision: 0.4422
recall: 0.9924
f1_score: 0.6118
correlation: -0.0155



