In [97]:
import sys
from pathlib import Path
sys.path.append(str(Path(sys.path[0]).parent))
from src.ingredient_tokenizer import load_tokenizer
from transformers import pipeline
from transformers import RobertaForMaskedLM
import numpy as np
tokenizer = load_tokenizer('../artifacts')

# model = RobertaForMaskedLM.from_pretrained('../artifacts', output_attentions=True)
model = RobertaForMaskedLM.from_pretrained('../artifacts')

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../artifacts and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def show_head_view(model, tokenizer, text):
    inputs = tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    attention = model(input_ids)[-1]
    input_id_list = input_ids[0].tolist() # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    head_view(attention, tokens)

In [11]:
def get_attention(texts, model, tokenizer):
    inputs = tokenizer.batch_encode_plus(texts, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    attention = model(input_ids)[-1]
    return np.stack([a.detach().numpy() for a in attention])

In [100]:
texts = ["salt sugar baking_soda lemon_juice flour egg shortening",
         "salt sugar baking_soda flour molasses egg shortening",
         "salt sugar baking_soda flour egg brown_sugar shortening",
         "salt sugar baking_soda flour egg shortening buttermilk",
        ]
attention = get_attention(texts, model, tokenizer)

In [34]:
tokenized_input = tokenizer('salt sugar baking_soda lemon_juice flour egg shortening', return_tensors='pt')['input_ids']
baking_sugar_vector = model(tokenized_input)[0][0, 1, :]

In [44]:
tokenized_input = tokenizer('baking_soda sugar flour salt cookies ground_ginger ground_cloves molasses ground_cinnamon egg shortening', return_tensors='pt')['input_ids']
baking_sugar_vector_2 = model(tokenized_input)[0][0, 1, :]

In [45]:
tokenized_input = tokenizer('sugar mustard kidney_beans hamburger_buns sharp_white_cheddar_cheese water vegetable_oil gruyere_cheese sea_salt onions quinoa garlic butter ketchup chipotle_powder lettuce tomatoes freshly_ground_pepper mayonnaise cloves', return_tensors='pt')['input_ids']
cooking_sugar_vector = model(tokenized_input)[0][0, 1, :]

In [50]:
tokenized_input = tokenizer('sugar fish_sauce lime peas carrot salted_peanuts star_anise chile rice_wine_vinegar thai_basil chicken_stock garlic rice_stick_noodles cinnamon ginger scallions boneless_skinless_chicken_breast_halves canola_oil cloves mung_bean_sprouts', return_tensors='pt')['input_ids']
cooking_sugar_vector_2 = model(tokenized_input)[0][0, 1, :]

In [51]:
import torch

In [59]:
torch.cosine_similarity(baking_sugar_vector, baking_sugar_vector_2, dim=-1)

tensor(0.8224, grad_fn=<DivBackward0>)

In [60]:
torch.cosine_similarity(cooking_sugar_vector, cooking_sugar_vector_2, dim=-1)

tensor(0.8807, grad_fn=<DivBackward0>)

In [61]:
torch.cosine_similarity(baking_sugar_vector, cooking_sugar_vector, dim=-1)

tensor(0.7721, grad_fn=<DivBackward0>)

In [62]:
torch.cosine_similarity(baking_sugar_vector_2, cooking_sugar_vector, dim=-1)

tensor(0.6976, grad_fn=<DivBackward0>)

In [63]:
torch.cosine_similarity(baking_sugar_vector_2, cooking_sugar_vector_2, dim=-1)

tensor(0.7152, grad_fn=<DivBackward0>)

In [58]:
torch.cosine_similarity(cooking_sugar_vector, cooking_sugar_vector_2, dim=-1)

tensor(0.8807, grad_fn=<DivBackward0>)

In [84]:
vectors = []
recipes = []
with open('../data/processed/recipes_val.txt') as f:
    for recipe in f.readlines():
        ings = recipe.split(' ')
        if 'sugar' in ings:
            tokenized_input = tokenizer(' '.join(ings), return_tensors='pt')['input_ids']
            vector = model(tokenized_input)[0][0, ings.index('sugar')+1, :]
            
            vectors.append(vector.detach().numpy())
            recipes.append(recipe)
        
        if len(vectors) > 500:
            break

In [85]:
from sklearn.cluster import KMeans

In [86]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(vectors)
clusters = kmeans.predict(vectors)

In [94]:
vectors[0].shape

(3505,)

In [103]:
np.array(recipes)[np.argwhere(clusters == 0)][:10]

array([['flour salt vanilla_extract unsalted_butter cold_water sugar egg_yolk\n'],
       ['butter cream_cheese flour milk vanilla_extract margarine cherry_pie_filling cold_water cocoa sugar egg\n'],
       ['egg_yolks eggs salt milk cream sugar ground_cinnamon\n'],
       ['eggs flour salt milk zest unsalted_butter lemons sugar lemon_juice baking_powder\n'],
       ['vodka sugar goose lemon_juice triple_sec\n'],
       ['butter cooked_bacon eggs flour salt vanilla pecans unsweetened_cocoa_powder sugar bittersweet_chocolate\n'],
       ['cranberries syrup jello cream_cheese celery pineapple raspberry water grapes marshmallow_cream sugar relish heavy_whipping_cream dressing\n'],
       ['flour salt milk vanilla_extract unsalted_butter grated_nutmeg sugar active_dry_yeast egg_yolk\n'],
       ['butter nuts nutmeg flour salt eggs cinnamon water sugar cloves raisins pumpkin baking_soda\n'],
       ['butter nuts flour salt vanilla almond_extract sugar coconut egg\n']],
      dtype='<U309')

In [104]:
np.array(recipes)[np.argwhere(clusters == 1)][:10]

array([['green_onions chicken_broth water white_wine_vinegar wild_rice vegetable_oil sugar curry_powder garlic_minced scallions shrimp\n'],
       ['salt onion garlic_cloves ancho pepper water bacon_grease sugar chili_powder tomatoes\n'],
       ['salt tomato_paste pizza_sauce pepper sugar spaghetti\n'],
       ['baking_soda jalapenos eggs salt buttermilk flour garlic_cloves honey sweet_onion egg_white corn_kernels sugar lime_juice chili_powder baking_powder frozen_corn yellow_cornmeal\n'],
       ['sugar yellow_mustard celery_salt pepper\n'],
       ['parsley salt olive_oil pepper dijon_mustard vegetable_oil red_wine_vinegar sugar shallots\n'],
       ['chili_oil garnish ginger eggs scallions black_vinegar ground_black_pepper cornstarch toasted_sesame_oil mushrooms soy_sauce bamboo_shoots sugar vegetable_oil firm_tofu garlic_minced vegetable_stock\n'],
       ['salt cooking_spray avocado chipotle_chile_powder corn_tortillas tuna lime_wedges sour_cream sugar cilantro_leaves chili_powde