In [1]:
import sys
from pathlib import Path
# Put the parent of sys.path[0] on the import path so that the `src` package
# resolves. This has to run before the `src` import on the next line.
# NOTE(review): assumes sys.path[0] is this notebook's directory — confirm for
# the kernel/front-end in use.
sys.path.append(str(Path(sys.path[0]).parent))
from src.ingredient_tokenizer import load_tokenizer, get_tokenizer_vocab
from transformers import pipeline

# Tokenizer and model weights are both read from the artifacts directory
# (relative paths; presumably resolved against the working directory).
tokenizer = load_tokenizer('../artifacts')

# Fill-mask pipeline configured with topk=5.
# The load warning about newly initialised `roberta.pooler.*` weights concerns
# the pooler only; the manual forward pass further down reproduces the model's
# logits via embeddings -> encoder -> lm_head without touching the pooler.
fill_mask = pipeline("fill-mask", 
                     model='../artifacts/', 
                     tokenizer=tokenizer,
                     topk=5)
# Keep a handle on the underlying masked-LM so it can be called directly.
model = fill_mask.model

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../artifacts/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Ask the pipeline to fill the masked slot after 'dough pizza_sauce';
# the recorded output lists five scored candidate ingredients.
fill_mask('dough pizza_sauce <mask>')

[{'sequence': '<s> dough pizza_sauce mozzarella_cheese </s>',
  'score': 0.23033706843852997,
  'token': 156,
  'token_str': 'mozzarella_cheese'},
 {'sequence': '<s> dough pizza_sauce pepperoni </s>',
  'score': 0.20622113347053528,
  'token': 733,
  'token_str': 'pepperoni'},
 {'sequence': '<s> dough pizza_sauce olive_oil </s>',
  'score': 0.0663054808974266,
  'token': 12,
  'token_str': 'olive_oil'},
 {'sequence': '<s> dough pizza_sauce cheese </s>',
  'score': 0.06030401214957237,
  'token': 21,
  'token_str': 'cheese'},
 {'sequence': '<s> dough pizza_sauce parmesan </s>',
  'score': 0.03762682154774666,
  'token': 49,
  'token_str': 'parmesan'}]

In [3]:
# Encode the same prompt as PyTorch tensors and keep only the token ids.
encoding = tokenizer('dough pizza_sauce <mask>', return_tensors='pt')
tokenized_input = encoding['input_ids']
tokenized_input

tensor([[  0, 431, 856,   4,   2]])

In [4]:
# Call the underlying model directly on the token ids; the recorded output is
# a 1-tuple holding a logits tensor with one row per input token.
model(tokenized_input)

(tensor([[[ 20.7888,  -5.7311,  -2.3179,  ...,  -1.1549,   1.2096,  -0.6517],
          [ -3.5585, -10.7750,  -3.0984,  ...,  -2.4188,  -3.7403,  -3.7300],
          [ -4.2900, -10.2704,  -1.8199,  ...,  -1.9546,  -2.8219,  -1.6147],
          [ -3.2282, -12.2347,  -5.2322,  ...,  -2.8265,  -3.5606,  -4.4717],
          [ -3.3837,  -7.7216,  17.7582,  ...,  -4.7777,  -3.7840,  -2.4419]]],
        grad_fn=<AddBackward0>),)

In [5]:
# Reproduce the forward pass by hand: embeddings -> encoder -> LM head.
embedded = model.roberta.embeddings(tokenized_input)
encoder_outputs = model.roberta.encoder(embedded)
# Element 0 of the encoder result is what the LM head consumes.
x = model.lm_head(encoder_outputs[0])
x

tensor([[[ 20.7888,  -5.7311,  -2.3179,  ...,  -1.1549,   1.2096,  -0.6517],
         [ -3.5585, -10.7750,  -3.0984,  ...,  -2.4188,  -3.7403,  -3.7300],
         [ -4.2900, -10.2704,  -1.8199,  ...,  -1.9546,  -2.8219,  -1.6147],
         [ -3.2282, -12.2347,  -5.2322,  ...,  -2.8265,  -3.5606,  -4.4717],
         [ -3.3837,  -7.7216,  17.7582,  ...,  -4.7777,  -3.7840,  -2.4419]]],
       grad_fn=<AddBackward0>)

In [6]:
# ResNet-18 is imported only as a size reference: the following cells
# instantiate it and count its parameters.
from torchvision.models import resnet18

In [8]:
# Instantiate ResNet-18 for the parameter count in the next cell.
# NOTE(review): the positional False is presumably the `pretrained` flag
# (i.e. random initialisation) — confirm against the installed torchvision.
m = resnet18(False)

In [20]:
# Total number of scalar parameters held by `m`.
sum(param.numel() for param in m.parameters())

11689512

# Embeddings

In [21]:
# Pull the model's input word-embedding matrix out as a NumPy array;
# detach() is called first so the tensor is cut off from autograd.
embeddings = model.roberta.embeddings.word_embeddings.weight.detach().numpy()

In [22]:
# Vocabulary entries used as labels for the embedding rows.
# NOTE(review): presumably returned in token-id order so that ingredients[i]
# labels embeddings[i] — verify in src.ingredient_tokenizer; a different order
# would silently mislabel every vector.
ingredients = get_tokenizer_vocab(tokenizer)

In [23]:
from gensim.models import KeyedVectors

In [24]:
# Sanity check: there should be exactly one ingredient label per embedding row.
len(embeddings), len(ingredients)

(3505, 3505)

In [25]:
# Wrap the embedding matrix in a gensim KeyedVectors index keyed by ingredient
# name so that similarity and analogy queries can be run against it.
# Fail loudly if labels and rows do not line up one-to-one.
assert len(ingredients) == len(embeddings), "vocabulary / embedding row count mismatch"
# Take the vector size from the matrix itself rather than hard-coding 768, so
# the cell keeps working if the model's hidden size changes.
embeddings_model = KeyedVectors(embeddings.shape[1])
embeddings_model.add(ingredients, embeddings)

In [26]:
# Look up the raw embedding vector stored for a single ingredient.
embeddings_model.get_vector('ham')

array([ 4.19010781e-02, -1.44569215e-03,  1.52035421e-02, -1.22054724e-03,
       -2.36960258e-02, -4.02079755e-03, -4.23919484e-02, -7.11792056e-03,
        1.79526806e-02, -1.98006183e-02,  2.64843311e-02,  3.48729342e-02,
       -5.82780130e-03, -2.05886662e-02,  5.71692083e-03, -5.18989004e-02,
       -1.06926113e-02,  2.67722271e-02, -4.05197740e-02,  4.16247454e-03,
        8.59029740e-02, -2.03046333e-02,  1.31649422e-02, -1.91716589e-02,
       -2.67268885e-02, -3.41102332e-02,  6.17067367e-02,  2.69439421e-03,
       -7.66160060e-03,  2.23249942e-02, -3.77335064e-02, -1.90900303e-02,
       -9.87960026e-03,  4.58221249e-02, -7.50269787e-03, -1.04591455e-02,
       -4.56144363e-02,  4.27748822e-02,  1.64360926e-02, -9.18315817e-03,
       -1.25347491e-04,  2.31818464e-02, -1.83910280e-02, -2.19459515e-02,
        2.19262508e-03,  2.09313035e-02, -1.63532756e-02,  3.58854309e-02,
       -4.34484929e-02,  2.44188420e-02, -3.10478471e-02, -8.87274568e-04,
       -8.07295553e-03, -

In [27]:
# Nearest neighbours of 'ham' in the embedding space, ranked by similarity score.
embeddings_model.most_similar('ham')

[('smoked_ham', 0.39625677466392517),
 ('cooked_ham', 0.3473472595214844),
 ('canadian_bacon', 0.31826114654541016),
 ('sliced_ham', 0.28591668605804443),
 ('corned_beef', 0.2710075378417969),
 ('boiled_ham', 0.2634613513946533),
 ('deli_ham', 0.2594316601753235),
 ('fully_cooked_ham', 0.2330356240272522),
 ('mortadella', 0.21839934587478638),
 ('mexican_chorizo', 0.21640340983867645)]

In [28]:
# Nearest neighbours of 'sugar' in the embedding space, ranked by similarity score.
embeddings_model.most_similar('sugar')

[('granulated_sugar', 0.3818522095680237),
 ('brown_sugar', 0.3476158380508423),
 ('white_sugar', 0.28680968284606934),
 ('light_brown_sugar', 0.225079745054245),
 ('honey', 0.2248568832874298),
 ('water', 0.2237204909324646),
 ('powdered_sugar', 0.1913958191871643),
 ('dark_brown_sugar', 0.17984548211097717),
 ('salt', 0.17680180072784424),
 ('garnish', 0.1673145890235901)]

In [29]:
# Vector arithmetic: butter - salt; only the single best match is requested.
embeddings_model.most_similar(positive=["butter"], negative=["salt"], topn=1)

[('unsalted_butter', 0.27746471762657166)]

In [30]:
# Analogy: ground_beef - beef + pork (transfers the "ground" offset to pork).
embeddings_model.most_similar(positive=["ground_beef", "pork"], negative=["beef"], topn=1)

[('ground_pork', 0.28692853450775146)]

In [31]:
# Analogy: dried_oregano - oregano + sage (transfers the "dried" offset to sage).
embeddings_model.most_similar(positive=["dried_oregano", "sage"], negative=["oregano"], topn=1)

[('dried_sage', 0.24299708008766174)]

In [32]:
# Analogy: hot_water - water + butter (applies the "hot" offset to butter).
embeddings_model.most_similar(positive=["hot_water", "butter"], negative=["water"], topn=1)

[('melted_butter', 0.29524338245391846)]

In [33]:
# "hot" offset with 'hot_water' itself as the second positive term.
# NOTE(review): 'hot_water' is listed twice in `positive` — confirm this is
# intentional and not a leftover from editing the neighbouring cells.
embeddings_model.most_similar(positive=["hot_water", "hot_water"], negative=["water"], topn=1)

[('boiling_water', 0.2949616014957428)]

In [34]:
# Analogy: hot_water - water + milk (applies the "hot" offset to milk).
embeddings_model.most_similar(positive=["hot_water", "milk"], negative=["water"], topn=1)

[('evaporated_milk', 0.308748334646225)]

In [78]:
# The bare "hot" offset: hot_water - water, with no second positive term.
# NOTE(review): the recorded output is a KeyError for 'coffee_granuals', a word
# that does not occur in this query, and the execution count is out of
# sequence — the output looks stale; re-run this cell.
embeddings_model.most_similar(positive=["hot_water"], negative=["water"], topn=1)

KeyError: "word 'coffee_granuals' not in vocabulary"

In [35]:
# Analogy: hot_water - water + bread. The recorded top hit ('himalayan_salt')
# shows the "hot" offset does not transfer meaningfully here.
embeddings_model.most_similar(positive=["hot_water", "bread"], negative=["water"], topn=1)

[('himalayan_salt', 0.29187968373298645)]

In [43]:
# Plural probe: carrots - carrot + potato (does the plural offset transfer?).
embeddings_model.most_similar(positive=["carrots", "potato"], negative=["carrot"], topn=1)

[('potatoes', 0.24644336104393005)]

In [44]:
# Plural probe: carrots - carrot + onion.
embeddings_model.most_similar(positive=["carrots", "onion"], negative=["carrot"], topn=1)

[('onions', 0.2654738426208496)]

In [64]:
# Plural probe: carrots - carrot + tomato.
embeddings_model.most_similar(positive=["carrots", "tomato"], negative=["carrot"], topn=1)

[('tomatoes', 0.2559482455253601)]

In [66]:
# Plural probe: carrots - carrot + garlic (recorded result is the countable
# form 'garlic_cloves' rather than a literal plural).
embeddings_model.most_similar(positive=["carrots", "garlic"], negative=["carrot"], topn=1)

[('garlic_cloves', 0.3597503900527954)]

In [67]:
# Plural probe: carrots - carrot + egg.
embeddings_model.most_similar(positive=["carrots", "egg"], negative=["carrot"], topn=1)

[('eggs', 0.3555490970611572)]

In [68]:
# Plural probe: carrots - carrot + apple (recorded result is a specific plural
# variety, 'granny_smith_apples').
embeddings_model.most_similar(positive=["carrots", "apple"], negative=["carrot"], topn=1)

[('granny_smith_apples', 0.26275596022605896)]

In [69]:
# Plural probe on a mass noun: carrots - carrot + basil (recorded: 'basil_leaves').
embeddings_model.most_similar(positive=["carrots", "basil"], negative=["carrot"], topn=1)

[('basil_leaves', 0.23774810135364532)]

In [71]:
# Plural probe on a mass noun: carrots - carrot + ice (recorded: 'ice_cubes').
embeddings_model.most_similar(positive=["carrots", "ice"], negative=["carrot"], topn=1)

[('ice_cubes', 0.22948268055915833)]

In [72]:
# Plural probe on a mass noun: carrots - carrot + bacon (recorded: 'cooked_bacon').
embeddings_model.most_similar(positive=["carrots", "bacon"], negative=["carrot"], topn=1)

[('cooked_bacon', 0.2114761620759964)]

In [83]:
# Plural probe: carrots - carrot + chicken. The recorded top hit ('venison',
# with a low score) shows the offset does not transfer for this word.
embeddings_model.most_similar(positive=["carrots", "chicken"], negative=["carrot"], topn=1)

[('venison', 0.1553337275981903)]

In [63]:
# Vocabulary entries ending in 'es' — a rough way to list plural-looking ingredient names.
[name for name in ingredients if name.endswith('es')]

['tomatoes',
 'garlic_cloves',
 'leaves',
 'potatoes',
 'cloves',
 'pepper_flakes',
 'red_pepper_flakes',
 'noodles',
 'egg_whites',
 'chives',
 'olives',
 'apples',
 'strawberries',
 'bay_leaves',
 'basil_leaves',
 'chilies',
 'cranberries',
 'blueberries',
 'chicken_breast_halves',
 'parsley_leaves',
 'ground_cloves',
 'molasses',
 'cherry_tomatoes',
 'black_olives',
 'green_chilies',
 'raspberries',
 'plum_tomatoes',
 'chiles',
 'sweet_potatoes',
 'cherries',
 'cilantro_leaves',
 'mint_leaves',
 'thyme_leaves',
 'oranges',
 'dried_cranberries',
 'vegetables',
 'peaches',
 'limes',
 'cookies',
 'spinach_leaves',
 'ice_cubes',
 'medium_tomatoes',
 'egg_noodles',
 'preserves',
 'dates',
 'lettuce_leaves',
 'boneless_skinless_chicken_breast_halves',
 'roma_tomatoes',
 'medium_potatoes',
 'grapes',
 'russet_potatoes',
 'boneless_chicken_breast_halves',
 'green_olives',
 'sundried_tomatoes',
 'gold_potatoes',
 'grape_tomatoes',
 'pecan_halves',
 'yukon_gold_potatoes',
 'oregano_leaves',
 

In [62]:
# Plural probe with a berry pair: blueberries - blueberry + strawberry.
# 'strawberries' is in the vocabulary (see the 'es' listing), yet the recorded
# top hit is 'frozen_blueberries', so the analogy fails here.
embeddings_model.most_similar(positive=["blueberries", "strawberry"], negative=["blueberry"], topn=1)

[('frozen_blueberries', 0.22762975096702576)]

In [23]:
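# Optional, currently disabled: export the embeddings (labelled with the
# ingredient names) for inspection in TensorBoard. Requires tensorboardX.
# from tensorboardX import SummaryWriter
# writer = SummaryWriter()
# writer.add_embedding(embeddings, metadata=ingredients)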

In [92]:
import re
# Scratch check that a digit pattern finds nothing in a digit-free string.
# The pattern is a raw string so that `\d` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape.
re.findall(r'\d', 'a')

[]