# Masked language models

In [1]:
from transformers import pipeline

# Name the checkpoint explicitly rather than relying on the library's default
# for the task, which can differ between transformers releases.
# distilroberta-base is the checkpoint the default resolved to when this
# notebook was run.
fill_mask = pipeline("fill-mask", model="distilroberta-base")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# A fill-mask query must contain the tokenizer's mask token; without one the
# pipeline has nothing to predict and raises. The token is read from the
# tokenizer so the query stays valid whatever the model's mask string is.
fill_mask(f"Share your Opinion Shop & Search to earn real cash {fill_mask.tokenizer.mask_token}!")

# Training on recipes

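Reference guide: [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)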

In [6]:
!head ../data/processed/recipes_train.txt

garlic eggplant flour salt olive_oil seasoned_bread_crumbs grated_parmesan_cheese pepper rounds mozzarella_cheese tomatoes
eggs oreo cookies margarine water sugar_cookie_mix brownie ingredients
nutmeg cinnamon_sticks apple_juice orange_blossom water tea_bags
red_chili pepper_sauce chili_paste olive_oil pork_loin mirin cornstarch oyster_sauce
medium_shrimp dried_red_pepper fillets olive_oil fennel_fronds garlic_cloves bottled_clam_juice dry_white_wine lemon arugula
black_beans salt olive_oil avocado cilantro bell_pepper ground_cumin red_onion lime_juice
dark_chocolate eggs salt vanilla_extract unsalted_butter confectioners_sugar skim_milk brioche_bread heavy_cream
cake_flour white_sugar salt milk vanilla_extract unsalted_butter pumpkin_pie_spice ground_cloves egg_white powdered_sugar ground_cinnamon canola_oil baking_powder temperature
boneless_pork_loin salt pork_loin_chops golden_delicious_apples ground_cinnamon brown_sugar
melted_butter flour salt dry_milk water sugar instan

In [7]:
!cat ../data/processed/recipes_train.txt | wc -l

1039974


In [7]:
import sys
from pathlib import Path

# Make the parent directory importable so that the `src` package resolves.
# The working directory is used rather than sys.path[0], whose value depends
# on how the kernel was launched (it can be '' or another directory); the
# '../artifacts' paths below are relative to the working directory as well.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:  # avoid piling up duplicates on re-run
    sys.path.append(project_root)

from src.ingredient_tokenizer import load_tokenizer

# Tokenizer loaded from the same directory as the model.
tokenizer = load_tokenizer('../artifacts')

# Fill-mask pipeline backed by the model stored in ../artifacts, asking for
# the 500 highest-scoring candidates per query.
fill_mask_recipe = pipeline(
    "fill-mask",
    model='../artifacts/',
    tokenizer=tokenizer,
    topk=500,
)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../artifacts/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Ask the recipe model which ingredient is missing from this list.
ingredient_query = '<mask> pepper salt chicken celery onion carrot flour hen'
fill_mask_recipe(ingredient_query)

[{'sequence': '<s> water pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.4776608943939209,
  'token': 13,
  'token_str': 'water'},
 {'sequence': '<s> milk pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.0520767867565155,
  'token': 20,
  'token_str': 'milk'},
 {'sequence': '<s> butter pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.04503853991627693,
  'token': 7,
  'token_str': 'butter'},
 {'sequence': '<s> egg pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.0331474170088768,
  'token': 29,
  'token_str': 'egg'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.020470695570111275,
  'token': 253,
  'token_str': 'cold_water'},
 {'sequence': '<s> eggs pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.019405586645007133,
  'token': 18,
  'token_str': 'eggs'},
 {'sequence': '<s> shortening pepper salt chicken celery onion carrot flo

# Validation

In [10]:
from src.validation import validate
import pandas as pd

# Cut-offs k at which the validation score is reported; reused as the column
# labels of the results table.
ks = [1, 2, 5, 10, 20, 50, 100]

# NOTE(review): device=-1 presumably selects the CPU and limit=5000 caps the
# number of validation samples -- confirm against src.validation.validate.
validation_result = validate(
    fill_mask_recipe.model,
    tokenizer,
    ks=ks,
    device=-1,
    limit=5000,
    relative_base_path='../',
)

382it [13:42,  2.15s/it]                         


In [11]:
# One-row table: a column per cut-off k, a single row labelled 'accuracy'.
df = pd.DataFrame(
    [validation_result],
    columns=ks,
    index=['accuracy'],
)

# Show the scores as percentages with one decimal place.
df.mul(100).style.format('{:,.1f}%')

Unnamed: 0,1,2,5,10,20,50,100
accuracy,20.1%,28.2%,39.3%,48.1%,56.9%,68.3%,75.7%


# The Model (Transformer)

[<img src="https://miro.medium.com/max/700/0*ViwaI3Vvbnd-CJSQ.png">](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
Image source: https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270

In [11]:
# Take the underlying model out of the pipeline so it can be called directly
# in the cells below.
model = fill_mask_recipe.model

In [12]:
# Reference result from the pipeline; the following cells reproduce it by hand.
pizza_prompt = 'dough pizza_sauce <mask>'
fill_mask_recipe(pizza_prompt)

[{'sequence': '<s> dough pizza_sauce mozzarella_cheese </s>',
  'score': 0.23033706843852997,
  'token': 156,
  'token_str': 'mozzarella_cheese'},
 {'sequence': '<s> dough pizza_sauce pepperoni </s>',
  'score': 0.20622113347053528,
  'token': 733,
  'token_str': 'pepperoni'},
 {'sequence': '<s> dough pizza_sauce olive_oil </s>',
  'score': 0.0663054808974266,
  'token': 12,
  'token_str': 'olive_oil'},
 {'sequence': '<s> dough pizza_sauce cheese </s>',
  'score': 0.06030401214957237,
  'token': 21,
  'token_str': 'cheese'},
 {'sequence': '<s> dough pizza_sauce parmesan </s>',
  'score': 0.03762682154774666,
  'token': 49,
  'token_str': 'parmesan'},
 {'sequence': '<s> dough pizza_sauce mozzarella </s>',
  'score': 0.03502006083726883,
  'token': 125,
  'token_str': 'mozzarella'},
 {'sequence': '<s> dough pizza_sauce meatballs </s>',
  'score': 0.012741947546601295,
  'token': 1279,
  'token_str': 'meatballs'},
 {'sequence': '<s> dough pizza_sauce pepperoni_slices </s>',
  'score': 0.0

In [13]:
# Encode the prompt as PyTorch tensors and keep only the token ids.
encoded_prompt = tokenizer('dough pizza_sauce <mask>', return_tensors='pt')
tokenized_input = encoded_prompt['input_ids']
tokenized_input

tensor([[  0, 431, 856,   4,   2]])

In [14]:
# Full forward pass of the masked-LM on the encoded prompt.
model(input_ids=tokenized_input)

(tensor([[[ 20.7888,  -5.7311,  -2.3179,  ...,  -1.1549,   1.2096,  -0.6517],
          [ -3.5585, -10.7750,  -3.0984,  ...,  -2.4188,  -3.7403,  -3.7300],
          [ -4.2900, -10.2704,  -1.8199,  ...,  -1.9546,  -2.8219,  -1.6147],
          [ -3.2282, -12.2347,  -5.2322,  ...,  -2.8265,  -3.5606,  -4.4717],
          [ -3.3837,  -7.7216,  17.7582,  ...,  -4.7777,  -3.7840,  -2.4419]]],
        grad_fn=<AddBackward0>),)

In [32]:
# Same forward pass, taken apart stage by stage:
# token ids -> embeddings -> encoder -> language-model head.
embedded = model.roberta.embeddings(tokenized_input)
encoder_outputs = model.roberta.encoder(embedded)
# The encoder returns a tuple; its first element is what feeds the LM head.
x = model.lm_head(encoder_outputs[0])
x

tensor([[[ 20.7888,  -5.7311,  -2.3179,  ...,  -1.1549,   1.2096,  -0.6517],
         [ -3.5585, -10.7750,  -3.0984,  ...,  -2.4188,  -3.7403,  -3.7300],
         [ -4.2900, -10.2704,  -1.8199,  ...,  -1.9546,  -2.8219,  -1.6147],
         [ -3.2282, -12.2347,  -5.2322,  ...,  -2.8265,  -3.5606,  -4.4717],
         [ -3.3837,  -7.7216,  17.7582,  ...,  -4.7777,  -3.7840,  -2.4419]]],
       grad_fn=<AddBackward0>)

In [36]:
# Dimensions of the LM-head output computed above.
x.shape

torch.Size([1, 5, 3505])

In [81]:
# Locate the mask token in the encoded prompt instead of hard-coding its
# position, so this cell keeps working if the prompt changes.
mask_position = tokenized_input[0].tolist().index(tokenizer.mask_token_id)

# Scores produced by the LM head at the masked position.
mask_part = x[0, mask_position, :]
mask_part

tensor([ -3.2282, -12.2347,  -5.2322,  ...,  -2.8265,  -3.5606,  -4.4717],
       grad_fn=<SliceBackward>)

In [49]:
import torch

# Keep the five highest scores at the masked position, with their vocabulary ids.
n_candidates = 5
topk = torch.topk(mask_part, k=n_candidates)
topk

torch.return_types.topk(
values=tensor([7.6516, 7.5410, 6.4063, 6.3114, 5.8397], grad_fn=<TopkBackward>),
indices=tensor([156, 733,  12,  21,  49]))

In [54]:
# Map the top-scoring ids back to ingredient tokens. The index tensor is
# converted to a plain list of ints first, which is the input type
# convert_ids_to_tokens is documented to accept.
tokenizer.convert_ids_to_tokens(topk.indices.tolist())

['mozzarella_cheese', 'pepperoni', 'olive_oil', 'cheese', 'parmesan']