In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

def get_text_embeddings(text, model_name="distilbert-base-uncased"):
    """
    Encode text into fixed-size embeddings using Hugging Face Transformers.

    The last-layer token vectors are mean-pooled per input text. Pooling is
    weighted by the tokenizer's attention mask, so padding tokens (added when a
    list of texts with different lengths is passed) neither influence the model
    nor dilute the average.

    Args:
        text (str or list of str): The input text, or a batch of texts, to encode.
        model_name (str): The pre-trained model to use. Default is DistilBERT.

    Returns:
        torch.Tensor: A 2-D tensor with one pooled embedding row per input text.

    Note:
        The tokenizer and model are loaded via ``from_pretrained`` on every
        call; for repeated use, callers may prefer to batch their texts into a
        single call.
    """
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize the input text; padding=True pads a batch to a common length
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoded.input_ids

    # Fall back to "every position is a real token" if the tokenizer did not
    # produce an attention mask.
    attention_mask = getattr(encoded, "attention_mask", None)
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)

    # Pass the tokenized input through the model to obtain embeddings
    with torch.no_grad():
        hidden_states = model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # Masked average pooling: zero out padded positions, then divide by the
        # number of real tokens in each text instead of the padded length.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against divide-by-zero
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts

    return embeddings

# Example usage: embed a single sentence with the default model.
input_text = "This is an example sentence for text embeddings."
embeddings = get_text_embeddings(text=input_text)

# Show the embedding tensor returned for 'input_text'.
print(embeddings)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tensor([[-1.0330e-01, -1.6333e-02, -2.1666e-01, -1.0828e-01, -3.1989e-02,
         -2.4618e-01,  1.4542e-01,  4.6435e-01,  9.3738e-02,  2.4434e-02,
         -3.8824e-01, -2.7084e-01, -3.2446e-01, -3.7958e-02, -1.1562e-01,
          3.3123e-01, -7.0980e-02,  1.8767e-01, -4.1543e-02, -2.0214e-01,
          2.6552e-01,  2.4957e-01, -2.7237e-01,  1.6857e-01,  6.6935e-01,
         -1.9383e-01,  1.2050e-01, -5.1234e-02, -4.9751e-01, -4.2159e-02,
          2.8471e-01,  3.5742e-01, -7.5617e-02, -1.1991e-01,  2.1094e-02,
         -1.0592e-01,  3.4045e-01, -2.1545e-01, -6.7204e-02,  2.5641e-01,
         -4.7964e-01, -1.2802e-01,  2.5541e-01,  6.0741e-02,  7.4723e-02,
         -3.9563e-01, -1.7565e-01, -2.5842e-02, -8.8910e-03, -3.5095e-02,
         -9.9681e-01,  2.8377e-01,  1.4295e-01,  2.7406e-01, -1.2784e-01,
          5.4545e-01,  1.8793e-03, -6.5151e-01,  1.2896e-01, -1.6064e-01,
          6.6524e-02,  7.9114e-02, -5.5293e-02, -3.8512e-01,  3.3144e-01,
          2.3010e-01, -1.5899e-01,  3.

In [4]:
from sentence_transformers import SentenceTransformer
# Load the pre-trained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Input sentences to embed.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# model.encode() embeds the whole list in a single call.
embeddings = model.encode(sentences)

# Show each sentence next to its embedding, followed by an empty line.
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173552e-02 -4.28515449e-02 -1.56286024e-02  1.40537303e-02
  3.95537727e-02  1.21796280e-01  2.94334106e-02 -3.17524187e-02
  3.54959629e-02 -7.93139935e-02  1.75878741e-02 -4.04369719e-02
  4.97259349e-02  2.54912246e-02 -7.18700588e-02  8.14968869e-02
  1.47069141e-03  4.79626991e-02 -4.50336412e-02 -9.92174670e-02
 -2.81769745e-02  6.45046085e-02  4.44670543e-02 -4.76217009e-02
 -3.52952331e-02  4.38671783e-02 -5.28566055e-02  4.33063833e-04
  1.01921506e-01  1.64072234e-02  3.26996595e-02 -3.45986746e-02
  1.21339476e-02  7.94870779e-02  4.58345609e-03  1.57777797e-02
 -9.68206208e-03  2.87625659e-02 -5.05805984e-02 -1.55793717e-02
 -2.87906546e-02 -9.62280575e-03  3.15556750e-02  2.27349028e-02
  8.71449187e-02 -3.85027491e-02 -8.84718448e-02 -8.75498448e-03
 -2.12343335e-02  2.08923239e-02 -9.02077407e-02 -5.25732562e-02
 -1.05638904e-02  2.88310610e-02 -1.61455162e-02  6.17837207e-03
 -1.23234

In [9]:
from sentence_transformers import SentenceTransformer
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)

# Report how many embeddings came back, then the shape of each of the two.
print(len(embeddings))
for row in (0, 1):
    print(embeddings[row].shape)

2
(384,)
(384,)
