# Pretrained Model (Hugging Face)

In [17]:
import numpy as np

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Example inputs to embed
sentences = [
    'This is an example sentence',
    'Each sentence is converted',
]

# Tokenizer and encoder come from the same HuggingFace Hub checkpoint
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

In [5]:
# Turn the sentences into padded, truncated PyTorch tensors
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass with gradient tracking disabled
with torch.no_grad():
    model_output = model(**encoded_input)

In [9]:
# Flatten each sentence's token states into a single 1-D vector
sent_a_emb = model_output['last_hidden_state'][0].reshape(-1)
sent_b_emb = model_output['last_hidden_state'][1].reshape(-1)

In [7]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Args:
        u: 1-D array-like of numbers (anything ``np.asarray`` accepts).
        v: 1-D array-like with the same length as ``u``.

    Returns:
        ``dot(u, v) / (||u|| * ||v||)``. If either vector has zero norm the
        quotient is undefined, so ``0.0`` is returned instead of dividing by
        zero and propagating ``nan``.
    """
    # Convert once so the dot product and both norms work on the same arrays.
    u = np.asarray(u)
    v = np.asarray(v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        return 0.0
    return np.dot(u, v) / denominator

In [10]:
# Score the two flattened example embeddings against each other and show the value
result = cosine_similarity(sent_a_emb, sent_b_emb)
print(result)

0.24433926


In [14]:
def tokenize(sentences):
    """Encode ``sentences`` and return flattened hidden states for the first two.

    Uses the module-level ``tokenizer`` and ``model``. The batch is tokenized
    with padding and truncation, run through the model without gradient
    tracking, and rows 0 and 1 of ``last_hidden_state`` are each flattened to
    a 1-D tensor.

    NOTE(review): only the first two rows are returned, and the per-token
    states are flattened rather than pooled with ``mean_pooling`` -- confirm
    this is intended.

    Args:
        sentences: Sequence of at least two strings.

    Returns:
        Tuple ``(first, second)`` of 1-D tensors.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so skip gradient bookkeeping
    with torch.no_grad():
        outputs = model(**batch)

    hidden = outputs['last_hidden_state']
    first, second = hidden[0], hidden[1]
    return first.reshape(-1), second.reshape(-1)

In [12]:
# Two sentences with overlapping wording but opposite meaning, used as a comparison pair
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."

In [13]:
# Batch of the two sentences to embed and compare
sentences = [sentence_a, sentence_b]

In [15]:
# Run both sentences through the model; a and b are the flattened embeddings returned by tokenize()
a, b = tokenize(sentences)

In [16]:
# Bare expression: the notebook displays its value as the cell output
cosine_similarity(a,b)

0.38588038