In [1]:
import sys
from pathlib import Path

# Make the project-local `src` package importable by putting the parent of the
# notebook's working directory on sys.path. The working directory is used as
# the anchor because the '../artifacts' paths below are resolved against it as
# well, and sys.path[0] can be an empty string in interactive sessions.
# The membership check keeps re-runs of this cell from appending duplicates.
repo_root = str(Path.cwd().parent)
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.ingredient_tokenizer import load_tokenizer, get_tokenizer_vocab
from transformers import pipeline

# Tokenizer and model checkpoint are both read from the ../artifacts directory.
tokenizer = load_tokenizer('../artifacts')

# NOTE(review): newer transformers releases appear to spell this argument
# `top_k` instead of `topk` -- confirm against the installed version before
# upgrading.
fill_mask = pipeline("fill-mask",
                     model='../artifacts/',
                     tokenizer=tokenizer,
                     topk=5)
# Keep a handle on the underlying model; later cells read its embedding table.
model = fill_mask.model

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../artifacts/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Embeddings

In [2]:
# Export the model's word-embedding table as a NumPy array.
# detach() drops autograd tracking; cpu() is a no-op when the weights already
# live on the CPU and otherwise copies them to host memory, which
# Tensor.numpy() requires.
embeddings = model.roberta.embeddings.word_embeddings.weight.detach().cpu().numpy()

In [3]:
# Vocabulary entries of the tokenizer. They are later paired one-to-one with
# the rows of `embeddings`, so this presumably returns tokens ordered by token
# id -- TODO confirm against src.ingredient_tokenizer.
ingredients = get_tokenizer_vocab(tokenizer)

In [4]:
from gensim.models import KeyedVectors

In [5]:
# Wrap the embedding matrix in a gensim KeyedVectors index so tokens can be
# queried by name. The vector size is read from the matrix itself instead of
# being hard-coded, so the cell keeps working if the checkpoint's hidden size
# changes.
# NOTE(review): gensim 4 appears to rename `add` to `add_vectors` -- confirm
# against the installed version before upgrading.
embeddings_model = KeyedVectors(embeddings.shape[1])
embeddings_model.add(ingredients, embeddings)

In [6]:
# Sanity check: look up the raw embedding vector stored for a single token.
embeddings_model.get_vector('ham')

array([ 4.19010781e-02, -1.44569215e-03,  1.52035421e-02, -1.22054724e-03,
       -2.36960258e-02, -4.02079755e-03, -4.23919484e-02, -7.11792056e-03,
        1.79526806e-02, -1.98006183e-02,  2.64843311e-02,  3.48729342e-02,
       -5.82780130e-03, -2.05886662e-02,  5.71692083e-03, -5.18989004e-02,
       -1.06926113e-02,  2.67722271e-02, -4.05197740e-02,  4.16247454e-03,
        8.59029740e-02, -2.03046333e-02,  1.31649422e-02, -1.91716589e-02,
       -2.67268885e-02, -3.41102332e-02,  6.17067367e-02,  2.69439421e-03,
       -7.66160060e-03,  2.23249942e-02, -3.77335064e-02, -1.90900303e-02,
       -9.87960026e-03,  4.58221249e-02, -7.50269787e-03, -1.04591455e-02,
       -4.56144363e-02,  4.27748822e-02,  1.64360926e-02, -9.18315817e-03,
       -1.25347491e-04,  2.31818464e-02, -1.83910280e-02, -2.19459515e-02,
        2.19262508e-03,  2.09313035e-02, -1.63532756e-02,  3.58854309e-02,
       -4.34484929e-02,  2.44188420e-02, -3.10478471e-02, -8.87274568e-04,
       -8.07295553e-03, -

# Similarity

In [7]:
# Nearest neighbours of 'ham' in embedding space, returned as (token, score) pairs.
embeddings_model.most_similar('ham')

[('smoked_ham', 0.39625677466392517),
 ('cooked_ham', 0.3473472595214844),
 ('canadian_bacon', 0.31826114654541016),
 ('sliced_ham', 0.28591668605804443),
 ('corned_beef', 0.2710075378417969),
 ('boiled_ham', 0.2634613513946533),
 ('deli_ham', 0.2594316601753235),
 ('fully_cooked_ham', 0.2330356240272522),
 ('mortadella', 0.21839934587478638),
 ('mexican_chorizo', 0.21640340983867645)]

In [8]:
# Same nearest-neighbour query for 'sugar'.
embeddings_model.most_similar('sugar')

[('granulated_sugar', 0.3818522095680237),
 ('brown_sugar', 0.3476158380508423),
 ('white_sugar', 0.28680968284606934),
 ('light_brown_sugar', 0.225079745054245),
 ('honey', 0.2248568832874298),
 ('water', 0.2237204909324646),
 ('powdered_sugar', 0.1913958191871643),
 ('dark_brown_sugar', 0.17984548211097717),
 ('salt', 0.17680180072784424),
 ('garnish', 0.1673145890235901)]

# Relationships between ingredients

In [9]:
# Combine two ingredients: which single token is closest to
# table_salt + ground_black_pepper?
embeddings_model.most_similar(
    positive=["table_salt", "ground_black_pepper"],
    topn=1,
)

[('salt_and_ground_black_pepper', 0.3656250238418579)]

In [10]:
# Subtract one ingredient from another: butter - salt.
embeddings_model.most_similar(
    positive=["butter"],
    negative=["salt"],
    topn=1,
)

[('unsalted_butter', 0.27746471762657166)]

In [12]:
# Analogy: carrot is to carrots as onion is to ...?
embeddings_model.most_similar(
    positive=["carrots", "onion"],
    negative=["carrot"],
    topn=1,
)

[('onions', 0.2654738426208496)]

In [13]:
def pluralize(ingredient, singular="carrot", plural="carrots", model=None):
    """Return the token that is to `ingredient` what `plural` is to `singular`.

    Runs a single analogy query with `plural` and `ingredient` as positive
    terms and `singular` as the negative term, and returns the token of the
    best match. The result is whatever vocabulary entry ranks first, which is
    not necessarily a grammatical plural of `ingredient`.

    Args:
        ingredient: Token to transform.
        singular: Base form of the example pair; defaults to "carrot".
        plural: Transformed form of the example pair; defaults to "carrots".
        model: Object exposing ``most_similar(positive=, negative=, topn=)``.
            When omitted, the notebook-level ``embeddings_model`` is used.

    Returns:
        The token of the top-ranked match (its score is discarded).
    """
    # Resolve the global lazily so an explicitly supplied model never touches it.
    vectors = embeddings_model if model is None else model
    best_matches = vectors.most_similar(
        positive=[plural, ingredient],
        negative=[singular],
        topn=1,
    )
    # most_similar yields (token, score) pairs; keep only the first token.
    return best_matches[0][0]

In [14]:
# Regular plural: the analogy recovers the expected token.
pluralize('onion')

'onions'

In [15]:
# Plural formed with "-es" rather than "-s".
pluralize('tomato')

'tomatoes'

In [16]:
# Input without a plain plural token in the result: the top match is a countable form instead.
pluralize('garlic')

'garlic_cloves'

In [17]:
# Another input where the top match is a countable form rather than a plural.
pluralize('basil')

'basil_leaves'

In [18]:
# Same pattern for 'ice'.
pluralize('ice')

'ice_cubes'

In [19]:
# Analogy: beef is to ground_beef as pork is to ...?
embeddings_model.most_similar(
    positive=["ground_beef", "pork"],
    negative=["beef"],
    topn=1,
)

[('ground_pork', 0.28692853450775146)]

In [20]:
# Analogy: oregano is to dried_oregano as sage is to ...?
embeddings_model.most_similar(
    positive=["dried_oregano", "sage"],
    negative=["oregano"],
    topn=1,
)

[('dried_sage', 0.24299708008766174)]

In [21]:
# Analogy: water is to hot_water as butter is to ...?
embeddings_model.most_similar(
    positive=["hot_water", "butter"],
    negative=["water"],
    topn=1,
)

[('melted_butter', 0.29524338245391846)]

In [22]:
# Analogy: water is to hot_water as hot_water is to ...?
# NOTE(review): "hot_water" is listed twice, presumably on purpose, so the
# water -> hot_water shift is applied to hot_water itself -- confirm.
embeddings_model.most_similar(
    positive=["hot_water", "hot_water"],
    negative=["water"],
    topn=1,
)

[('boiling_water', 0.2949616014957428)]

In [23]:
# Analogy: water is to hot_water as milk is to ...?
embeddings_model.most_similar(
    positive=["hot_water", "milk"],
    negative=["water"],
    topn=1,
)

[('evaporated_milk', 0.308748334646225)]