In [None]:
import spacy
import numpy as np

# Load spaCy's small English pipeline. `nlp` is the shared pipeline object that
# the embedding helpers in this notebook call to tokenize text and read
# per-token vectors.
# NOTE(review): the "sm" package presumably ships without static word vectors,
# in which case `token.vector` is not a classic pre-trained word embedding and
# `nlp.vocab.vectors_length` may be 0 — confirm, or load a model with vectors.
nlp = spacy.load("en_core_web_sm")

def average_embedding(sentence):
    """Return the element-wise mean of the token vectors of ``sentence``.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose
    ``has_vector`` flag is false are left out of the average (the filter is
    about vector availability, not about stopwords).

    Args:
        sentence: Text to embed.

    Returns:
        The mean vector over the retained tokens. If no token is retained
        (for example an empty string), a zero vector of length
        ``nlp.vocab.vectors_length`` is returned instead.
    """
    token_vectors = []
    for token in nlp(sentence):
        if token.has_vector:
            token_vectors.append(token.vector)

    if not token_vectors:
        # Nothing to average.
        # NOTE(review): this length comes from the vocab's vector table and may
        # differ from the size of `token.vector` for some pipelines — confirm.
        return np.zeros((nlp.vocab.vectors_length,))

    return np.mean(token_vectors, axis=0)

# Example sentence. `sentence` is a notebook-level variable that the TF-IDF
# cell further down reads again, so keep its name stable.
sentence = "The dog runs fast"
# Mean-of-token-vectors embedding for the example sentence.
embedding = average_embedding(sentence)

# Only a 5-element slice is printed to keep the cell output short.
print("Sentence Embedding (Averaging):", embedding[:5])  # Print first 5 values


Sentence Embedding (Averaging): [ 0.16034171 -0.06278075 -0.51387095 -0.10410072  0.00280273]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus used to learn the TF-IDF vocabulary and IDF weights
corpus = [
    "The dog runs fast",
    "A cat sleeps on the sofa",
    "Dogs and cats are great pets",
]

# Fit on the corpus; fit() hands back the fitted vectorizer itself
vectorizer = TfidfVectorizer().fit(corpus)

# Vocabulary terms, positionally aligned with the TF-IDF columns
words = vectorizer.get_feature_names_out()

# TF-IDF weights of the example `sentence` only (one dense row, one entry per
# term in `words`); they do not describe any other sentence
tfidf_scores = vectorizer.transform([sentence]).toarray()[0]

# Compute weighted embeddings
def tfidf_weighted_embedding(sentence):
    """Return the TF-IDF-weighted average of the token vectors of ``sentence``.

    The weights are computed from the ``sentence`` argument itself using the
    module-level fitted ``vectorizer``. Previously the function read the
    global ``tfidf_scores`` row, which belongs to one specific sentence, so
    any other input was weighted with the wrong scores (or divided by zero).

    Args:
        sentence: Text to embed.

    Returns:
        ``sum(w_i * v_i) / sum(w_i)`` over the tokens that have a vector,
        whose lower-cased text is a vocabulary term of ``vectorizer`` and
        whose TF-IDF weight for this sentence is positive. If no token
        qualifies, a zero vector of length ``nlp.vocab.vectors_length`` is
        returned.
    """
    doc = nlp(sentence)

    # Score the sentence that is actually being embedded.
    scores = vectorizer.transform([sentence]).toarray()[0]
    # term -> column index: constant-time lookup instead of copying and
    # scanning the feature-name array for every token.
    vocabulary = vectorizer.vocabulary_

    word_embeddings = []
    weights = []
    for token in doc:
        if not token.has_vector:
            continue
        idx = vocabulary.get(token.text.lower())
        if idx is None:
            continue
        weight = scores[idx]
        # The vectorizer tokenizes the text on its own, so a pipeline token
        # can be a vocabulary term and still score 0 for this sentence.
        # Skipping it changes nothing in the sums but avoids a 0/0 division.
        if weight <= 0:
            continue
        word_embeddings.append(token.vector * weight)
        weights.append(weight)

    if not weights:
        return np.zeros((nlp.vocab.vectors_length,))
    return np.sum(word_embeddings, axis=0) / np.sum(weights)

# `sentence` is the example defined in the first cell; `embedding` is
# overwritten here with the TF-IDF-weighted result.
embedding = tfidf_weighted_embedding(sentence)

# Only a 5-element slice is printed to keep the cell output short.
print("Sentence Embedding (TF-IDF Weighted):", embedding[:5])  # Print first 5 values


Sentence Embedding (TF-IDF Weighted): [ 0.14707127 -0.04034225 -0.53884666 -0.15503714  0.03409252]


In [None]:
from sklearn.decomposition import PCA

# One averaged embedding per corpus sentence, stacked row-wise
sentence_embeddings = np.array(list(map(average_embedding, corpus)))

# Project the embeddings with PCA. No target dimensionality is requested
# (n_components=None is the default), so the number of output components is
# chosen by PCA itself; the recorded output of this cell shows only 3 values
# for this 3-sentence corpus.
# NOTE(review): a reduction such as "300 to 50" needs an explicit n_components
# and presumably at least that many sentences — confirm before relying on it.
pca = PCA(n_components=None)
reduced_embeddings = pca.fit_transform(sentence_embeddings)

# Show at most the first 5 components of the first sentence
print("Reduced Sentence Embedding (PCA):", reduced_embeddings[0, :5])

Reduced Sentence Embedding (PCA): [-1.2933475e+00 -1.3441613e+00 -1.7296577e-07]


In [None]:
from sentence_transformers import SentenceTransformer

# Load pre-trained SBERT model. The progress bars in this cell's recorded
# output show the model files being fetched when the cell was run.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute SBERT sentence embedding
# `sentence` is re-assigned to the same text used in the first cell, and
# `embedding` is overwritten with the SBERT vector.
sentence = "The dog runs fast"
embedding = model.encode(sentence)

print("Sentence Embedding (SBERT):", embedding[:5])  # Print first 5 values

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Embedding (SBERT): [0.03234608 0.03101396 0.01024305 0.03830508 0.00155678]


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained SBERT model; `compute_similarity` below reads this
# notebook-level `model`.
# NOTE(review): the previous cell already loads the same model name into
# `model`, so this repeat looks redundant when cells run in order — presumably
# kept so this cell can be run on its own; confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is encoded on its own with the module-level ``model``; the
    two embeddings are then compared with ``cosine_similarity``.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The single similarity score taken from position ``[0][0]`` of the
        matrix returned by ``cosine_similarity``.
    """
    # Encode in argument order: index 0 is sentence1, index 1 is sentence2
    vectors = [model.encode(text) for text in (sentence1, sentence2)]

    # Each side is passed as a one-element list so it is treated as a
    # single-row batch
    score_matrix = cosine_similarity(vectors[:1], vectors[1:])
    return score_matrix[0][0]

# Example sentences
# NOTE(review): sentence1 has a space before the final period while sentence2
# does not — possibly a typo; changing it would change the printed score.
sentence1 = "A dog is running in the park ."
sentence2 = "A pup is playing in the park."

# Compute similarity
similarity = compute_similarity(sentence1, sentence2)

# Printed with 4 decimal places
print(f"Sentence Similarity Score (SBERT): {similarity:.4f}")

Sentence Similarity Score (SBERT): 0.8210
