In [6]:
# !python -m spacy download en_core_web_lg

In [7]:
import spacy

# Load the pipeline once; the similarity cell below reuses this `nlp` object
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [8]:
# Run each word through the pipeline; nlp() returns a Doc object, not a bare vector
word1, word2 = nlp("king"), nlp("queen")

# Similarity of the two Docs as estimated by spaCy
similarity_score = word1.similarity(word2)

# A Doc formats as its text, so the words themselves appear in the message
print(f"Similarity between '{word1}' and '{word2}': {similarity_score:.4f}")

Similarity between 'king' and 'queen': 0.7253


In [11]:
# Evaluate the pre-trained spaCy model
import pandas as pd
from scipy.stats import pearsonr, spearmanr

def evaluate_spacy_model(test_dataset_path, nlp=None, model_name="en_core_web_lg"):
    """
    Evaluates a spaCy pipeline on a test dataset with human-rated similarity scores.

    The similarity spaCy assigns to each word pair is compared against the
    human rating using Pearson and Spearman correlation.

    Args:
        test_dataset_path (str): Path to the tab-separated test dataset
            (SimLex-999). It must contain the columns 'word1', 'word2' and
            'SimLex999'.
        nlp (callable, optional): An already-loaded spaCy pipeline. Pass the
            notebook's existing pipeline here to avoid loading the model a
            second time. When None, ``model_name`` is loaded.
        model_name (str): Name of the spaCy model to load when ``nlp`` is None.

    Returns:
        dict: Pearson and Spearman correlation scores under the keys
        "Pearson Correlation" and "Spearman Correlation".

    Raises:
        KeyError: If the dataset lacks one of the required columns.
    """
    # Read and validate the dataset first so a malformed file fails before the
    # expensive model load rather than after it.
    test_data = pd.read_csv(test_dataset_path, delimiter="\t")
    required_columns = ['word1', 'word2', 'SimLex999']
    missing_columns = [col for col in required_columns if col not in test_data.columns]
    if missing_columns:
        raise KeyError(f"Test dataset is missing required column(s): {missing_columns}")

    # Only load a model when the caller did not supply one. The import is local
    # so the function does not rely on another cell having imported spaCy.
    if nlp is None:
        import spacy
        nlp = spacy.load(model_name)

    human_scores = test_data['SimLex999']

    # Words recur across pairs, so run the pipeline once per distinct word
    # instead of once per occurrence.
    distinct_words = pd.unique(test_data[['word1', 'word2']].to_numpy().ravel())
    docs = {word: nlp(word) for word in distinct_words}

    # The factor 10 only rescales the scores; correlations are unaffected by it.
    predicted_scores = [
        docs[first].similarity(docs[second]) * 10
        for first, second in zip(test_data['word1'], test_data['word2'])
    ]

    # Calculate Pearson and Spearman correlations
    pearson_corr, _ = pearsonr(human_scores, predicted_scores)
    spearman_corr, _ = spearmanr(human_scores, predicted_scores)

    return {
        "Pearson Correlation": pearson_corr,
        "Spearman Correlation": spearman_corr
    }

# Score the model against the SimLex-999 file in the working directory
test_dataset_path = "SimLex-999.txt"
evaluation_results = evaluate_spacy_model(test_dataset_path)

# Report each correlation to four decimal places
print("Evaluation Results:")
for metric_name in ("Pearson Correlation", "Spearman Correlation"):
    print(f"{metric_name}: {evaluation_results[metric_name]:.4f}")


Evaluation Results:
Pearson Correlation: 0.4367
Spearman Correlation: 0.4083
