In [2]:
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling: average the token embeddings, ignoring masked-out positions.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; it is given a trailing
            dimension and expanded to the embeddings' size before use.

    Returns:
        The masked sum over dimension 1 divided by the mask total for each
        position. The divisor is clamped to at least 1e-9, so a fully
        masked-out row yields zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension and make it float
    # so it can be used as a multiplicative weight.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load tokenizer and model from the HuggingFace Hub. Both are loaded from the
# same checkpoint id, so it is kept in one place.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize sentences (padded to a common length, truncated if too long)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings; no gradients are needed for inference
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling over the unmasked tokens.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings.shape)

Sentence embeddings:
torch.Size([2, 384])


In [3]:
def load_book(path, encoding='utf-8-sig'):
  """Read a text file and return its contents with surrounding whitespace removed.

  Args:
    path: Location of the text file to read.
    encoding: Text encoding used to decode the file. The default,
      'utf-8-sig', decodes plain UTF-8 and also drops a leading byte-order
      mark if one is present, so the result does not depend on the
      platform's default encoding.

  Returns:
    The decoded file contents as a single string, stripped of leading and
    trailing whitespace.
  """
  # An explicit encoding keeps non-ASCII punctuation (curly quotes, dashes)
  # intact regardless of the locale the notebook runs under.
  with open(path, 'r', encoding=encoding) as f:
    return f.read().strip()

In [4]:
# Split the book text into a list of single-line paragraphs.
def parse_book(book):
  """Break a book's text into paragraphs.

  The text is split on blank lines (two consecutive newlines). Each piece is
  stripped of surrounding whitespace, pieces that end up empty are dropped,
  and newlines remaining inside a paragraph are replaced by single spaces.

  Args:
    book: The full book text as one string.

  Returns:
    A list of paragraph strings in their original order.
  """
  paragraphs = []
  for chunk in book.split('\n\n'):
    text = chunk.strip()
    if not text:
      # Runs of blank lines produce empty chunks; skip them.
      continue
    paragraphs.append(text.replace('\n', ' '))
  return paragraphs


In [5]:
# Read the book text from disk and split it into a list of paragraphs.
moby_dick = parse_book(load_book('../books/2701.txt'))

In [6]:
# Show the last ten paragraphs as a quick sanity check of the parsing.
moby_dick[-10:]

['Diving beneath the settling ship, the whale ran quivering along its keel; but turning under water, swiftly shot to the surface again, far off the other bow, but within a few yards of Ahab’s boat, where, for a time, he lay quiescent.',
 '“I turn my body from the sun. What ho, Tashtego! let me hear thy hammer. Oh! ye three unsurrendered spires of mine; thou uncracked keel; and only god-bullied hull; thou firm deck, and haughty helm, and Pole-pointed prow,—death-glorious ship! must ye then perish, and without me? Am I cut off from the last fond pride of meanest shipwrecked captains? Oh, lonely death on lonely life! Oh, now I feel my topmost greatness lies in my topmost grief. Ho, ho! from all your furthest bounds, pour ye now in, ye bold billows of my whole foregone life, and top this one piled comber of my death! Towards thee I roll, thou all-destroying but unconquering whale; to the last I grapple with thee; from hell’s heart I stab at thee; for hate’s sake I spit my last breath at th

In [7]:
def generate_embeddings(paragraphs, batch_size=32):
  """Compute one mean-pooled embedding per paragraph.

  The paragraphs are processed in batches so that memory use is bounded by
  the batch size instead of growing with the whole input, and each batch is
  padded only to the length of its own longest paragraph.

  Args:
    paragraphs: A sequence of strings (a single string is treated as one
      paragraph).
    batch_size: Maximum number of paragraphs tokenized and passed through the
      model at once. Must be at least 1.

  Returns:
    A tensor with one row per paragraph, in input order. For empty input a
    tensor with zero rows is returned.

  Raises:
    ValueError: If batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError('batch_size must be at least 1')

  # A bare string would otherwise be sliced into character chunks below.
  if isinstance(paragraphs, str):
    paragraphs = [paragraphs]
  else:
    paragraphs = list(paragraphs)

  pooled_batches = []
  for start in range(0, len(paragraphs), batch_size):
    batch = paragraphs[start:start + batch_size]
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      model_output = model(**encoded_input)
    pooled_batches.append(mean_pooling(model_output, encoded_input['attention_mask']))

  if not pooled_batches:
    # Nothing to embed; assumes the model config exposes hidden_size as the
    # embedding width (true for BERT-style checkpoints) - TODO confirm if the
    # model is swapped.
    return torch.empty((0, model.config.hidden_size))
  return torch.cat(pooled_batches, dim=0)

In [8]:
# Keep the computed embeddings for later cells instead of discarding them,
# and display only the shape rather than echoing the entire tensor.
moby_dick_embeddings = generate_embeddings(moby_dick[1000:])
moby_dick_embeddings.shape

: 

: 