In [3]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [4]:
def load_book(path):
  with open(path, 'r') as f:
    return f.read().strip()
    

In [5]:
# break the book into paragraphs
def parse_book(book):
  return [line.strip().replace('\n', ' ') for line in book.split('\n\n') if line.strip()]

In [6]:
moby_dick = parse_book(load_book('../books/2701.txt'))


In [9]:
def generate_embeddings(paragraphs):
  encoded_input = tokenizer(paragraphs, padding=True, truncation=True, return_tensors='pt')
  with torch.no_grad():
    model_output = model(**encoded_input)
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
  return sentence_embeddings

In [10]:
embeddings = torch.zeros((0, 384))
def chunk(list, chunk_n):
    return [list[i:i+chunk_n] for i in range(0, len(list), chunk_n)]

chunks = chunk(moby_dick, 100)
for i, chunk in enumerate(chunks):
  chunk_embeddings = generate_embeddings(chunk)
  embeddings = torch.cat((embeddings, chunk_embeddings))
  print(f'Finished chunk {i} of {len(chunks)}')

Finished chunk 0 of 29
Finished chunk 1 of 29
Finished chunk 2 of 29
Finished chunk 3 of 29
Finished chunk 4 of 29
Finished chunk 5 of 29
Finished chunk 6 of 29
Finished chunk 7 of 29
Finished chunk 8 of 29
Finished chunk 9 of 29
Finished chunk 10 of 29
Finished chunk 11 of 29
Finished chunk 12 of 29
Finished chunk 13 of 29
Finished chunk 14 of 29
Finished chunk 15 of 29
Finished chunk 16 of 29
Finished chunk 17 of 29
Finished chunk 18 of 29
Finished chunk 19 of 29
Finished chunk 20 of 29
Finished chunk 21 of 29
Finished chunk 22 of 29
Finished chunk 23 of 29
Finished chunk 24 of 29
Finished chunk 25 of 29
Finished chunk 26 of 29
Finished chunk 27 of 29
Finished chunk 28 of 29


In [42]:
def search(query, books):
  query_embeddings = generate_embeddings([query])
  results = []
  books_embedding = torch.stack([book['embeddings'] for book in books])
  distances = torch.nn.functional.cosine_similarity(query_embeddings, books_embedding, dim=2)
  top_distances = torch.topk(torch.flatten(distances), k=10).indices
  return [books[i // books_embedding.shape[1]]["content"][i % books_embedding.shape[1]] for i in top_distances]

In [12]:
def create_book(name, path):
  content = parse_book(load_book(path))
  return {
    'name': name,
    'content': content,
    'embeddings': embeddings
  }

In [43]:
X = search("How do you hunt a whale?", [create_book("moby_dick_1", '../books/2701.txt'), create_book("moby_dick_1", '../books/2701.txt')])

torch.Size([2, 2802, 384])


  return [books[i // books_embedding.shape[1]]["content"][i % books_embedding.shape[1]] for i in top_distances]


In [44]:
X

['*The ancient whale-cry upon first sighting a whale from the mast-head, still used by whalemen in hunting the famous Gallipagos terrapin.',
 '*The ancient whale-cry upon first sighting a whale from the mast-head, still used by whalemen in hunting the famous Gallipagos terrapin.',
 'CHAPTER 103. Measurement of The Whale’s Skeleton.',
 'CHAPTER 103. Measurement of The Whale’s Skeleton.',
 'CHAPTER 103. Measurement of The Whale’s Skeleton.',
 'CHAPTER 103. Measurement of The Whale’s Skeleton.',
 'How vain and foolish, then, thought I, for timid untravelled man to try to comprehend aright this wondrous whale, by merely poring over his dead attenuated skeleton, stretched in this peaceful wood. No. Only in the heart of quickest perils; only when within the eddyings of his angry flukes; only on the profound unbounded sea, can the fully invested whale be truly and livingly found out.',
 'How vain and foolish, then, thought I, for timid untravelled man to try to comprehend aright this wondrous