In [11]:
# Small example corpora. Each one is a plain list of sentence strings;
# corpus_2 is the one read by the corpus-level similarity cells further down.

# Three near-identical sentences about a mat.
corpus_1 = [
    "the cat sat on the mat",
    "the dog sat on the mat",
    "the mat is on the floor",
]

# Sentences about AI / machine learning and "the future".
corpus_2 = [
    "artificial intelligence is the future",
    "machine learning is a subset of artificial intelligence",
    "the future of AI is bright",
]

# Mixed-case sentences about NLP and machine learning.
corpus_3 = [
    "NLP is evolving fast",
    "Machine learning is evolving",
    "Future of NLP is bright",
]

# Full sentences with punctuation, business / deep-learning themed.
corpus_4 = [
    "Machine learning improves decision-making in businesses.",
    "Businesses use machine learning for data analysis.",
    "Deep learning and neural networks outperform traditional models.",
]

In [12]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# --- BERT baseline: similarity of two paraphrased questions -----------------

# Pretrained uncased BERT-base checkpoint: tokenizer plus encoder.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Two differently worded versions of the same question.
sentences = [
    "How do I start learning Python?",
    "What are the first steps to learning Python?",
]

# Encode both sentences as one batch of PyTorch tensors
# (padding and truncation enabled).
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Use the hidden state at position 0 (the [CLS] token) of each sequence as the
# sentence vector, converted to a NumPy array.
sentence_embeddings = outputs.last_hidden_state[:, 0].numpy()

# Cosine similarity between the first and the second sentence vector;
# the result is a 1x1 matrix.
similarity_score = cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:2])
print("BERT Similarity Score:", similarity_score[0, 0])


BERT Similarity Score: 0.9618722


In [13]:
from sentence_transformers import SentenceTransformer

# --- SBERT: similarity of the same two paraphrased questions ----------------

# NOTE: this rebinds `model`, replacing whatever model object was held before.
model = SentenceTransformer('all-MiniLM-L6-v2')

sentences = [
    "How do I start learning Python?",
    "What are the first steps to learning Python?",
]

# One embedding per input sentence.
sentence_embeddings = model.encode(sentences)

# Cosine similarity between the first and the second embedding (1x1 matrix).
similarity_score = cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:2])
print("SBERT Similarity Score:", similarity_score[0, 0])


SBERT Similarity Score: 0.92464805


In [22]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# --- BERT on corpus_2: sentence 0 vs sentence 1 ------------------------------

# (Re)load the BERT-base tokenizer and encoder into `tokenizer` / `model`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# The whole corpus is encoded as a single batch of PyTorch tensors.
sentences = corpus_2
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Hidden state at position 0 (the [CLS] token) of every sentence, as NumPy.
sentence_embeddings = outputs.last_hidden_state[:, 0].numpy()

# Compare the first two corpus sentences and show them next to the score.
similarity_score = cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:2])
print(corpus_2[0], corpus_2[1], sep="\n")
print("BERT Similarity Score:", similarity_score[0, 0])


artificial intelligence is the future
machine learning is a subset of artificial intelligence
BERT Similarity Score: 0.8469002


In [21]:
# --- BERT on corpus_2: sentence 0 vs sentence 2 ------------------------------
#
# The embeddings are computed inside this cell rather than read from the shared
# `sentence_embeddings` variable. Both the BERT cell and the SBERT cell assign
# to that name, so reading it here would make the printed "BERT" score depend
# on whichever of those cells happened to run last.
if not isinstance(model, BertModel):
    # Fail loudly instead of printing a score under the wrong label.
    raise RuntimeError(
        "`model` is not a BertModel; re-run the BERT loading cell before this one."
    )

# Same tokenization and forward pass as the BERT cell, kept under private
# names so no shared variable is overwritten.
with torch.no_grad():
    bert_outputs = model(
        **tokenizer(corpus_2, padding=True, truncation=True, return_tensors="pt")
    )

# Hidden state at position 0 (the [CLS] token) of every sentence, as NumPy.
bert_embeddings = bert_outputs.last_hidden_state[:, 0, :].numpy()

print(corpus_2[0])
print(corpus_2[2])
similarity_score = cosine_similarity([bert_embeddings[0]], [bert_embeddings[2]])
print("BERT Similarity Score:", similarity_score[0][0])

artificial intelligence is the future
the future of AI is bright
BERT Similarity Score: 0.8215015


In [24]:
from sentence_transformers import SentenceTransformer

# --- SBERT on corpus_2: sentence 0 vs sentence 1 -----------------------------

# NOTE: rebinds `model` to the SBERT model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the whole corpus; one embedding per sentence.
sentences = corpus_2
sentence_embeddings = model.encode(sentences)

# Compare the first two corpus sentences and show them next to the score.
similarity_score = cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:2])
print(corpus_2[0], corpus_2[1], sep="\n")
print("SBERT Similarity Score:", similarity_score[0, 0])


artificial intelligence is the future
machine learning is a subset of artificial intelligence
SBERT Similarity Score: 0.5939864


In [23]:
# --- SBERT on corpus_2: sentence 0 vs sentence 2 -----------------------------
#
# The embeddings are computed inside this cell rather than read from the shared
# `sentence_embeddings` variable. Both the BERT cell and the SBERT cell assign
# to that name, so reading it here would make the printed "SBERT" score depend
# on whichever of those cells happened to run last.
if not isinstance(model, SentenceTransformer):
    # Fail loudly instead of printing a score under the wrong label.
    raise RuntimeError(
        "`model` is not a SentenceTransformer; re-run the SBERT loading cell before this one."
    )

# One embedding per corpus sentence, kept under a private name so the shared
# `sentence_embeddings` variable is not touched.
sbert_embeddings = model.encode(corpus_2)

similarity_score = cosine_similarity([sbert_embeddings[0]], [sbert_embeddings[2]])
print(corpus_2[0])
print(corpus_2[2])
print("SBERT Similarity Score:", similarity_score[0][0])

artificial intelligence is the future
the future of AI is bright
SBERT Similarity Score: 0.94385934


In [25]:
from transformers import pipeline

# --- Masked-word prediction with BERT ----------------------------------------

# Fill-mask pipeline backed by the uncased BERT-base checkpoint.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Sentence with one word replaced by the [MASK] placeholder.
sentence = "The quick brown [MASK] jumps over the lazy dog."

# Candidate fillers for the masked position; each entry is read below through
# its 'token_str' and 'score' keys.
predictions = fill_mask(sentence)

# Report only the first three candidates.
top_predictions = predictions[:3]
for pred in top_predictions:
    print(f"Predicted: {pred['token_str']} (Confidence: {pred['score']:.4f})")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


Predicted: cat (Confidence: 0.2298)
Predicted: dog (Confidence: 0.1143)
Predicted: bear (Confidence: 0.0780)
