In [1]:
from transformers import AutoTokenizer, AutoModel

# There are a lot of BERT based models available on HuggingFace,
# and you have to pick one that is suitable for you.
BERT_Model = "bert-base-uncased"

# Initialise the BERT Transformer model
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model = AutoModel.from_pretrained(BERT_Model)



In [2]:
# Function to compute the sentence embedding using BERT
def sent_embedding(sent):
    
    # Tokenize the sentence
    # This basically converts the sentence into a sequence of tokens
    # Each token is either a complete word or a sub-word
    tokens = tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                    padding='max_length', return_tensors='pt')
    
    # Now feed the tokens into the model and get the embeddings as the output
    outputs = model(**tokens)

    # Create an empty list to store two different kinds of embeddings
    embedding_list = []

    # last_hidden_state contains the output at the last hidden layer of all the sentence tokens
    # pooler_output contains the embedding corresponding to only the [CLS] token, which in a way represents the whole sentence. 
    # This pooler_output is, however, different from the embeddings corresponding to the 1st token of last_hidden_state
    # Although both represent the CLS token, the pooler_output is after some more processing, 
    # and more suitable for use in sentence classification tasks.

    # This stores the embedding corresponding to the CLS token
    embedding_list.append(outputs.last_hidden_state[0][0].detach().numpy().reshape(1,-1))

    # This stores the embedding corresponding to the pooler_output
    embedding_list.append(outputs.pooler_output.detach().numpy())

    return embedding_list

In [12]:
from sentence_transformers import SentenceTransformer, util
import recipe_parser as rp
import os
import pickle

if os.path.exists('pickle_files/recipe_titles.pkl'):
    titles = pickle.load(open('pickle_files/recipe_titles.pkl', 'rb'))
else:
    titles = rp.get_recipe_titles()

# There are several different Sentence Transformer models available on Hugging Face
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

wanted_Recipe = "Codfish"


# Convert the sentences into embeddings using the Sentence Transformer
sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
sent_embedding2 = model.encode(titles,convert_to_tensor=True)

# Find the similarity between the two embeddings
util.pytorch_cos_sim(sent_embedding1, sent_embedding2)

tensor([[0.2617, 0.6348, 0.2378, 0.2325, 0.3252, 0.1701, 0.4627, 0.2055, 0.4179,
         0.1785, 0.1482, 0.1377, 0.2189, 0.2509, 0.2243, 0.2032, 0.0626, 0.1596,
         0.2819, 0.1103, 0.0333, 0.2935, 0.3325, 0.3350, 0.2398, 0.3343, 0.2444,
         0.1697, 0.3402, 0.2394, 0.2323, 0.1738, 0.1048, 0.2437, 0.1600, 0.2170,
         0.2170, 0.1746, 0.2170, 0.2170, 0.4051, 0.1264, 0.1825, 0.2704, 0.1632,
         0.3940, 0.3350, 0.2378, 0.4517, 0.1766, 0.2533, 0.1051, 0.2067, 0.2305,
         0.1678, 0.2556, 0.1304, 0.4647, 0.1665, 0.2848, 0.2848, 0.2759, 0.3143,
         0.2578, 0.3312, 0.2649, 0.2664, 0.3490, 0.1834, 0.1880, 0.2203, 0.4124,
         0.3330, 0.3330, 0.3330, 0.3138, 0.2518, 0.2880, 0.2764, 0.2799, 0.2621,
         0.2860, 0.0537, 0.2077, 0.1462, 0.2375, 0.2561, 0.2733, 0.2375, 0.2561,
         0.1779, 0.2698, 0.1807, 0.2484, 0.3031, 0.2516, 0.3560, 0.3731, 0.2234,
         0.1376, 0.4494, 0.5595, 0.6259, 0.5983, 0.3680, 0.4774, 0.2028, 0.1987,
         0.2088, 0.2317, 0.1

In [13]:
# return the top 5 most similar recipes
top_k = 5
top_results = util.pytorch_cos_sim(sent_embedding1, sent_embedding2)[0].topk(top_k)
for score, idx in zip(top_results[0], top_results[1]):
    print(titles[idx], "(Score: {:.4f})".format(score.item()))
    

Pesto Sauce from Scratch (Score: 0.6681)
How to Make Pesto (Score: 0.6348)
Vegetarian Pasta (Score: 0.6302)
Italian Pasta Salad I (Score: 0.6259)
Pasta Frittata (Score: 0.5992)
