# 🧪 Practical: Text Similarity Checker using Embeddings + Cosine Similarity

🔹 Step 1: Install Required Libraries

In [1]:
!pip install -q transformers sentence-transformers scikit-learn


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🔹 Step 2: Import Libraries

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# 🔹 Step 3: Load Pretrained Embedding Model
We’ll use `all-MiniLM-L6-v2`, a pretrained Sentence-BERT model that is fast and effective for semantic similarity.

In [3]:
# Instantiate the pretrained Sentence-BERT encoder by its model name.
# On first use the model files are downloaded (see the progress output below).
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔹 Step 4: Define Sample Texts

In [4]:
# Two sample sentences to compare (swap in any text you like)
text1, text2 = (
    "Machine learning helps computers learn from data.",
    "AI allows machines to learn and make decisions from data.",
)

# Optional: swap in an unrelated second sentence to compare
# text2 = "The Eiffel Tower is in Paris."


🔹 Step 5: Compute Sentence Embeddings

In [5]:
# Encode both sentences in a single call; the result holds one row per input text
sentences = [text1, text2]
embeddings = model.encode(sentences)

# Sanity check on the result's dimensions (the recorded run shows (2, 384))
print("Embedding shape:", embeddings.shape)


Embedding shape: (2, 384)


🔹 Step 6: Calculate Cosine Similarity

In [6]:
# Row slices keep each operand two-dimensional (one row per text),
# so the pairwise result is a 1x1 matrix whose only entry is the score.
first_vec = embeddings[0:1]
second_vec = embeddings[1:2]
similarity_score = cosine_similarity(first_vec, second_vec)[0, 0]

# Report the score rounded to four decimal places
print(f"\n🧠 Cosine Similarity Score: {similarity_score:.4f}")



🧠 Cosine Similarity Score: 0.7310


🔹 Step 7: Interpret the Score

In [7]:
# Map the score onto three coarse bands: above 0.8, above 0.5, and everything else.
# The conditions are evaluated top to bottom, so the first match wins.
print(
    "✅ The texts are highly similar." if similarity_score > 0.8
    else "🟡 The texts are somewhat similar." if similarity_score > 0.5
    else "❌ The texts are not very similar."
)


🟡 The texts are somewhat similar.


# 🚀 Optional Enhancements

- Compare multiple texts at once (batch processing).
- Visualize pairwise similarities in a heatmap (for document sets).
- Try other pretrained models (`all-mpnet-base-v2`, `paraphrase-MiniLM-L6-v2`, etc.).