In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Checkpoint to use; the tokenizer and the encoder are both loaded from this
# same name so they share one vocabulary.
model_name = "bert-base-uncased"

# Instantiate the tokenizer first, then the model, from the chosen checkpoint.
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (BertTokenizer, BertModel)
)


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Sentence to embed.
text = "BERT embeddings are powerful for NLP tasks."
# text = ""

# Encoding options, gathered in one place:
#   - tensors come back as PyTorch tensors,
#   - every sequence ends up exactly 512 positions long: longer inputs are
#     truncated and shorter ones are padded up to that length.
encode_kwargs = {
    "return_tensors": "pt",
    "max_length": 512,
    "truncation": True,
    "padding": "max_length",
}
inputs = tokenizer(text, **encode_kwargs)

print(inputs)


{'input_ids': tensor([[  101, 14324,  7861,  8270,  4667,  2015,  2024,  3928,  2005, 17953,
          2361,  8518,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [16]:
# Forward pass only: gradient tracking is switched off because nothing is
# being trained here.
with torch.no_grad():
    outputs = model(**inputs)

# Two fields of the model output are kept:
#   last_hidden_state -> one vector per token,
#                        shape (batch_size, seq_len, hidden_size)
#   pooler_output     -> one vector per sequence, shape (batch_size, hidden_size).
#                        NOTE: per the transformers docs this is the [CLS]
#                        hidden state passed through an extra dense + tanh
#                        layer, not the raw [CLS] embedding.
token_embeddings, sentence_embedding = (
    outputs.last_hidden_state,
    outputs.pooler_output,
)

print(f"Token Embeddings: {token_embeddings.shape}")
print(f"Sentence Embedding: {sentence_embedding.shape}")


Token Embeddings: torch.Size([1, 512, 768])
Sentence Embedding: torch.Size([1, 768])


In [17]:
# Example of mean pooling for a sentence embedding.
def mean_pool(token_embeddings, attention_mask):
    """Average token embeddings over the non-padding positions of each sequence.

    Args:
        token_embeddings: tensor of shape (batch_size, seq_len, hidden_size).
        attention_mask: tensor of shape (batch_size, seq_len); 1 marks a valid
            token, 0 marks padding.

    Returns:
        Tensor of shape (batch_size, hidden_size): one mean vector per sequence.
    """
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute zero to the sum.
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Number of valid tokens per sequence; the clamp only guards against a
    # division by zero if a row were entirely padding.
    counts = mask.sum(dim=1).clamp(min=1)
    return summed / counts


# Pool per sequence instead of boolean-indexing a squeezed tensor: this stays
# correct for batches larger than one and does not rebind `token_embeddings`
# to a different shape.
attention_mask = inputs["attention_mask"]  # 1 = valid token, 0 = padding
mean_embedding = mean_pool(token_embeddings, attention_mask).squeeze(0)  # drop batch dim when it is 1

print("Mean Sentence Embedding:", mean_embedding.shape)


Mean Sentence Embedding: torch.Size([768])


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Tokenizer and encoder are loaded from the same checkpoint.
minilm_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(minilm_checkpoint)
model = AutoModel.from_pretrained(minilm_checkpoint)

# The two titles to compare.
sentences = [
    "AI for BPH Surgical Decision-Making: Cost Effectiveness and Outcomes",
    "The Promise of Artificial Intelligence in Peyronie’s Disease",
]

# Embed each title on its own (no padding is involved, so a plain mean over
# the token axis gives one vector per title). Inference only, so gradient
# tracking is disabled for the whole loop.
em = []
with torch.no_grad():
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        em.append(embeddings)

# Cosine similarity between the first and the second title embedding.
similarity = cosine_similarity(em[0], em[1])
print(similarity)

[[0.24413648]]


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small corpus of three sentences used to illustrate TF-IDF weighting.
documents = [
    "Data science is an interdisciplinary field.",
    "Machine learning is a part of data science.",
    "Data science and artificial intelligence overlap.",
]

# Learn the vocabulary and weights from the corpus and vectorise it in one call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms, plus a dense array copy of the matrix for printing.
feature_names = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

print("Feature Names:")
print(feature_names)
print("\nTF-IDF Matrix:")
# One row per document, numbered from 1 for display.
for doc_number, weights in enumerate(tfidf_array, start=1):
    print(f"Document {doc_number}: {weights}")


Feature Names:
['an' 'and' 'artificial' 'data' 'field' 'intelligence' 'interdisciplinary'
 'is' 'learning' 'machine' 'of' 'overlap' 'part' 'science']

TF-IDF Matrix:
Document 1: [0.48359121 0.         0.         0.28561676 0.48359121 0.
 0.48359121 0.36778358 0.         0.         0.         0.
 0.         0.28561676]
Document 2: [0.         0.         0.         0.25712876 0.         0.
 0.         0.3311001  0.43535684 0.43535684 0.43535684 0.
 0.43535684 0.25712876]
Document 3: [0.         0.46138073 0.46138073 0.27249889 0.         0.46138073
 0.         0.         0.         0.         0.         0.46138073
 0.         0.27249889]


In [None]:
from sentence_transformers import SentenceTransformer

# Download the model from the 🤗 Hub.
model = SentenceTransformer("tomaarsen/glove-wikipedia-tf-idf")

# Run inference: one embedding per sentence.
sentences = [
    'A woman is dancing.',
    'A man is dancing.',
    'A brown horse in a green field.',
]
# Fixed: stray trailing characters after this call made the cell a SyntaxError.
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 300]

# Pairwise similarity scores of the embeddings against themselves.
# `similarity` takes two sets of embeddings, so the same array is passed twice.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
