# Masked language models

In [1]:
from transformers import pipeline

# Baseline masked-language model used for comparison with the recipe model.
# The checkpoint is named explicitly so the notebook does not depend on the
# library's default for the "fill-mask" task (which resolved to
# distilroberta-base when this notebook was run).
fill_mask = pipeline("fill-mask", model="distilroberta-base")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# A fill-mask pipeline needs exactly one mask token in its input; the original
# sentence had none, so there was nothing to predict. The token is read from
# the tokenizer instead of being hard-coded, because it is model specific.
fill_mask(f"Share your Opinion, Shop & Search to earn real cash {fill_mask.tokenizer.mask_token}.")

# Training on recipes

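Reference: [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) (Hugging Face blog).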

In [7]:
!head ../data/processed/recipes_train.txt

garlic eggplant flour salt olive_oil seasoned_bread_crumbs grated_parmesan_cheese pepper rounds mozzarella_cheese tomatoes
eggs oreo cookies margarine water sugar_cookie_mix brownie ingredients
nutmeg cinnamon_sticks apple_juice orange_blossom water tea_bags
red_chili pepper_sauce chili_paste olive_oil pork_loin mirin cornstarch oyster_sauce
medium_shrimp dried_red_pepper fillets olive_oil fennel_fronds garlic_cloves bottled_clam_juice dry_white_wine lemon arugula
black_beans salt olive_oil avocado cilantro bell_pepper ground_cumin red_onion lime_juice
dark_chocolate eggs salt vanilla_extract unsalted_butter confectioners_sugar skim_milk brioche_bread heavy_cream
cake_flour white_sugar salt milk vanilla_extract unsalted_butter pumpkin_pie_spice ground_cloves egg_white powdered_sugar ground_cinnamon canola_oil baking_powder temperature
boneless_pork_loin salt pork_loin_chops golden_delicious_apples ground_cinnamon brown_sugar
melted_butter flour salt dry_milk water sugar instan

In [8]:
!cat ../data/processed/recipes_train.txt | wc -l

1039974


In [9]:
import sys
from pathlib import Path

# Make the repository root importable so that the `src` package resolves.
# The notebook addresses its inputs as '../artifacts' and '../data', so the
# root is taken to be the parent of the working directory. Path.cwd() is used
# instead of sys.path[0] because sys.path[0] may be the empty string, in which
# case its "parent" is '.' and `src` would not be found.
repo_root = str(Path.cwd().resolve().parent)
if repo_root not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(repo_root)

from src.ingredient_tokenizer import load_tokenizer

# Project tokenizer loaded from the same directory as the trained model.
tokenizer = load_tokenizer('../artifacts')

# Fill-mask pipeline backed by the recipe model stored in ../artifacts/.
# topk=500 asks for the 500 highest-scoring candidates per masked position.
# NOTE(review): newer transformers releases spell this argument `top_k`;
# `topk` is kept to match the version this notebook was run with - confirm
# before upgrading.
fill_mask_recipe = pipeline("fill-mask",
                            model='../artifacts/',
                            tokenizer=tokenizer,
                            topk=500)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../artifacts/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Ask the recipe model which ingredient is hidden behind <mask>.
# The pipeline returns a list of candidates (up to 500 here, best score
# first), so only the ten best are displayed to keep the output readable.
fill_mask_recipe('cold_water pepper salt chicken celery onion carrot <mask> hen')[:10]

[{'sequence': '<s> cold_water pepper salt chicken celery onion carrot flour hen </s>',
  'score': 0.19535516202449799,
  'token': 10,
  'token_str': 'flour'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot parsley hen </s>',
  'score': 0.08524889498949051,
  'token': 36,
  'token_str': 'parsley'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot eggs hen </s>',
  'score': 0.051332026720047,
  'token': 18,
  'token_str': 'eggs'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot egg_noodles hen </s>',
  'score': 0.050168510526418686,
  'token': 504,
  'token_str': 'egg_noodles'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot carcass hen </s>',
  'score': 0.03739031404256821,
  'token': 2982,
  'token_str': 'carcass'},
 {'sequence': '<s> cold_water pepper salt chicken celery onion carrot bay_leaf hen </s>',
  'score': 0.03536321967840195,
  'token': 217,
  'token_str': 'bay_leaf'},
 {'sequence': '<s> 

In [18]:
# Complete a short ingredient list: what goes with a bagel and cheddar?
# The pipeline returns a list of candidates (up to 500 here, best score
# first), so only the ten best are displayed to keep the output readable.
fill_mask_recipe('bagel cheddar_cheese <mask>')[:10]

[{'sequence': '<s> bagel cheddar_cheese wheat </s>',
  'score': 0.1582866907119751,
  'token': 147,
  'token_str': 'wheat'},
 {'sequence': '<s> bagel cheddar_cheese honey </s>',
  'score': 0.059073228389024734,
  'token': 71,
  'token_str': 'honey'},
 {'sequence': '<s> bagel cheddar_cheese bacon </s>',
  'score': 0.04927658662199974,
  'token': 90,
  'token_str': 'bacon'},
 {'sequence': '<s> bagel cheddar_cheese apple </s>',
  'score': 0.03522687032818794,
  'token': 172,
  'token_str': 'apple'},
 {'sequence': '<s> bagel cheddar_cheese turkey </s>',
  'score': 0.029940197244286537,
  'token': 206,
  'token_str': 'turkey'},
 {'sequence': '<s> bagel cheddar_cheese chips </s>',
  'score': 0.027665911242365837,
  'token': 297,
  'token_str': 'chips'},
 {'sequence': '<s> bagel cheddar_cheese butter </s>',
  'score': 0.027140742167830467,
  'token': 7,
  'token_str': 'butter'},
 {'sequence': '<s> bagel cheddar_cheese mayonnaise </s>',
  'score': 0.02631981298327446,
  'token': 81,
  'token_s

# Validation

In [37]:
from src.validation import validate
import pandas as pd

# Cut-offs k for which a score is reported; they become the column labels
# of the results table built from `validation_result`.
ks = [1, 2, 5, 10, 20, 50, 100]

# Run the project's validation routine on the model behind the recipe pipeline.
# NOTE(review): device=-1 presumably selects the CPU and limit=5000 presumably
# caps the number of validation examples - confirm against src.validation.
validation_result = validate(
    fill_mask_recipe.model,
    tokenizer,
    ks=ks,
    device=-1,
    limit=5000,
    relative_base_path='../',
)

38it [01:46,  2.81s/it]                        


In [38]:
# One-row table: a column per cut-off in `ks`, a single row labelled 'accuracy'.
df = pd.DataFrame(data=[validation_result], index=['accuracy'], columns=ks)
# Scale to percent and render each cell with one decimal place.
df.mul(100).style.format('{:,.1f}%'.format)

Unnamed: 0,1,2,5,10,20,50,100
accuracy,20.8%,29.4%,40.4%,49.3%,57.5%,68.6%,76.0%


In [61]:
# One-row table: a column per cut-off in `ks`, a single row labelled 'accuracy'.
# NOTE(review): same code as the table cell above, but with an out-of-order
# execution count and different numbers - presumably `validation_result` came
# from a different validation run; confirm which run/model this refers to.
df = pd.DataFrame(data=[validation_result], index=['accuracy'], columns=ks)
# Scale to percent and render each cell with one decimal place.
df.mul(100).style.format('{:,.1f}%'.format)

Unnamed: 0,1,2,5,10,20,50,100
accuracy,19.6%,27.6%,38.8%,47.9%,56.7%,68.3%,75.7%


In [43]:
# One-row table: a column per cut-off in `ks`, a single row labelled 'accuracy'.
# NOTE(review): same code as the earlier table cells, but with an out-of-order
# execution count and different numbers - presumably `validation_result` came
# from a different validation run; confirm which run/model this refers to.
df = pd.DataFrame(data=[validation_result], index=['accuracy'], columns=ks)
# Scale to percent and render each cell with one decimal place.
df.mul(100).style.format('{:,.1f}%'.format)

Unnamed: 0,1,2,5,10,20,50,100
accuracy,20.1%,27.3%,38.4%,47.5%,56.3%,67.6%,75.6%
