In [1]:
from transformers import AutoTokenizer, AutoModel

# Many BERT-based checkpoints are available on Hugging Face;
# pick the one that suits the task. This notebook uses "bert-base-uncased".
BERT_Model = "bert-base-uncased"

# Load the tokenizer and the model for the chosen checkpoint.
# Both are module-level globals that sent_embedding() reads when it is called.
# NOTE: a later cell rebinds `model` to a SentenceTransformer instance;
# sent_embedding() only works while `model` still refers to this BERT model.
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model = AutoModel.from_pretrained(BERT_Model)



In [2]:
# Function to compute the sentence embedding using BERT
def sent_embedding(sent):
    """Compute two BERT-based embeddings for a single sentence.

    Relies on the module-level ``tokenizer`` and ``model`` objects, looked up
    at call time.

    Args:
        sent: The sentence to embed.

    Returns:
        A list with two NumPy arrays, in this order:
          0. The last-hidden-layer output at the first token position
             (the [CLS] token), reshaped to a single row.
          1. The model's ``pooler_output``.
    """
    # Imported locally because this notebook has no top-level torch import.
    import torch

    # Tokenize the sentence: convert it into a sequence of tokens, each being
    # either a complete word or a sub-word. The sequence is truncated/padded
    # to exactly 128 positions and returned as PyTorch tensors.
    tokens = tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                    padding='max_length', return_tensors='pt')

    # Inference only: disable gradient tracking so no autograd graph is built
    # for a forward pass whose gradients are never used.
    with torch.no_grad():
        outputs = model(**tokens)

        # last_hidden_state holds the last hidden layer's output for every
        # token of the sentence; position 0 is the [CLS] token.
        cls_embedding = outputs.last_hidden_state[0][0].numpy().reshape(1, -1)

        # pooler_output also corresponds to the [CLS] token only, which in a
        # way represents the whole sentence. It is, however, different from
        # the first-token entry of last_hidden_state: it has gone through
        # some more processing and is more suitable for sentence
        # classification tasks.
        pooled_embedding = outputs.pooler_output.numpy()

    # Order matters to callers: [CLS] hidden state first, pooler output second.
    return [cls_embedding, pooled_embedding]

In [18]:
import sys
sys.path.insert(0, '../Phase 1')

from sentence_transformers import SentenceTransformer, util
import recipe_parser as rp
import os
import pickle

# Load the recipe titles from the pickle cache when it exists; otherwise
# obtain them from the recipe parser.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this cache must only ever be a file produced by this project.
TITLES_PICKLE_PATH = 'pickle_files/recipe_titles.pkl'
if os.path.exists(TITLES_PICKLE_PATH):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(TITLES_PICKLE_PATH, 'rb') as titles_file:
        titles = pickle.load(titles_file)
else:
    titles = rp.get_recipe_titles()

# There are several different Sentence Transformer models available on Hugging Face
# NOTE(review): this rebinds the global `model` that sent_embedding() (defined
# in an earlier cell) reads at call time. After this cell runs, calling
# sent_embedding() would use the SentenceTransformer instead of the BERT model.
# Consider a distinct name once all users of `model` are known.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# The query that every recipe title is compared against.
wanted_Recipe = "Chicken"


# Convert the sentences into embeddings using the Sentence Transformer
sent_embedding1 = model.encode(wanted_Recipe, convert_to_tensor=True)
sent_embedding2 = model.encode(titles, convert_to_tensor=True)

# Find the similarity between the two embeddings.
# As the last expression of the cell, the similarity tensor is displayed.
util.pytorch_cos_sim(sent_embedding1, sent_embedding2)

tensor([[ 4.2901e-01,  9.7670e-02,  4.8259e-02,  9.0336e-02,  1.2619e-01,
          1.2014e-01,  1.4150e-01,  1.0840e-01,  1.1864e-01,  1.5965e-01,
          1.4042e-01,  2.6605e-01,  2.6470e-01,  1.4448e-01,  2.4267e-01,
          2.4683e-01,  1.9597e-01,  1.8027e-01,  1.6995e-01,  2.0672e-01,
          4.3776e-02,  3.1589e-01,  3.0971e-01,  3.3430e-01,  2.6817e-01,
          5.6125e-01,  2.6984e-01,  1.3486e-01,  2.6009e-01,  2.0989e-01,
          3.8190e-01,  1.7731e-01,  1.4406e-01,  1.0420e-01,  1.8408e-01,
          5.7358e-01,  5.7358e-01,  2.5568e-01,  5.7358e-01,  5.7358e-01,
          3.3881e-01,  1.4773e-01,  5.1501e-01,  1.6463e-01,  2.9453e-01,
          2.4629e-01,  2.0364e-01,  2.9032e-01,  3.3123e-01,  2.9442e-01,
          4.1454e-01,  1.0094e-01,  1.5092e-01,  5.0996e-01,  1.3142e-01,
          2.3048e-01,  4.8943e-02,  2.3354e-01,  8.2769e-05,  4.3619e-01,
          4.3619e-01,  4.0583e-01,  4.0630e-01,  4.0131e-01,  4.3604e-01,
          3.8936e-01,  2.2135e-01,  2.

In [19]:
# Print the top 5 recipe titles most similar to the query, best match first.
top_k = 5

# Row 0 of the similarity matrix holds the scores for sent_embedding1
# against every entry of sent_embedding2.
similarity_scores = util.pytorch_cos_sim(sent_embedding1, sent_embedding2)[0]

# topk raises if k is larger than the number of candidates, so clamp k to
# the number of available scores.
top_results = similarity_scores.topk(min(top_k, similarity_scores.size(0)))

# Convert to plain Python floats/ints so `titles` is indexed with an int
# rather than a 0-dim tensor.
for score, idx in zip(top_results[0].tolist(), top_results[1].tolist()):
    print(titles[idx], "(Score: {:.4f})".format(score))
    

Home-Roasted Chicken (Score: 0.7254)
English Roast Chicken (Score: 0.7054)
Chicken Parmesan (Score: 0.6722)
Chicken Parmesan (Score: 0.6722)
Chicken Parmesan (Score: 0.6722)
