# Dependencies

Set directory of files

In [1]:
import os

# Root folder holding the Recipe1M files and every artefact this notebook
# reads or writes. Set the RECIPES1M_DIR environment variable to point it
# somewhere else; the fallback keeps the original location.
directory = os.environ.get('RECIPES1M_DIR', '/Users/hisham/Google Drive/Recipes1M')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
import json
import requests
import string
from collections import defaultdict
import spacy
import regex as re
import numpy as np
from nltk import download
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
download('wordnet')
download('stopwords')

[nltk_data] Downloading package wordnet to /Users/hisham/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hisham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
!pip install gensim==4.0.1



In [4]:
!pip install -U numpy

Requirement already up-to-date: numpy in /Users/hisham/opt/anaconda3/lib/python3.8/site-packages (1.21.2)


# Parsing Recipe1M to ingredient list dataset

## Loading Recipe1M

In [None]:
# Parse the Recipe1M layer-1 dump. The context manager closes the handle even
# when json.load raises.
json_filename = f'{directory}/layer1.json'
with open(json_filename, 'r') as json_file:
  data = json.load(json_file)

## Generating food names and synonym dictionaries
Here we extract the ingredient names from the KB, normalise them, and build a set of all known ingredient names. We also generate a synonym dict that maps every synonym to its main ingredient name (we'll make use of this when we parse ingredients from Recipe1M).

In [5]:
lemmatizer = WordNetLemmatizer()

def normalise_ingredient(name):
  """
  Normalises a knowledge-base ingredient name.

  The name is lower-cased, dashes become spaces, parenthesised remarks are
  removed, every remaining word is lemmatized and the words are joined with
  underscores (e.g. 'Green-Beans (fresh)' -> 'green_bean').

  Non-string input is returned unchanged.
  """
  if not isinstance(name, str):
    return name
  name = name.lower()
  name = name.replace('-', ' ')

  # remove parenthesised items one group at a time, innermost first; a greedy
  # r'\(.*\)' would also delete the text between two separate groups
  previous = None
  while previous != name:
    previous = name
    name = re.sub(r'\([^()]*\)', "", name)

  words = [lemmatizer.lemmatize(word) for word in name.split()]

  return "_".join(words)

In [6]:
ids = requests.get('https://ecarekb.schlegel-online.de/foodon_ids').json()
food_names = set([normalise_ingredient(ing['ingredient']) for ing in ids])
list(food_names)[:5]

['green_bean', 'chorizo', 'blackcurrant', 'cornmeal', 'truffle']

In [7]:
# Map every normalised alternate name to the normalised main name of its
# ingredient. Alternates that normalise to the main name itself are skipped.
# When two ingredients share an alternate name, the later entry in `ids` wins.
synonyms = {}
for ing in ids:
  name = normalise_ingredient(ing['ingredient'])
  syns = []
  for alternate in ing['alternate_names']:
    normalised = normalise_ingredient(alternate)
    if normalised != name:
      syns.append(normalised)
  for word in syns:
    synonyms[word] = name

Save them

In [None]:
with open(f'{directory}/synonyms.json', 'w') as f:
  json.dump(synonyms, f)

In [None]:
with open(f'{directory}/food_names.json', 'w') as f:
  json.dump(list(food_names), f)

## Ingredient/instruction filtering functions

In [8]:
all_names = food_names.union(synonyms.keys())

In [9]:
def get_name(ing):
  """
  Returns the main ingredient name for a normalised name: the name itself if
  it is in `food_names`, otherwise whatever `synonyms` maps it to (KeyError if
  it is in neither).
  """
  return ing if ing in food_names else synonyms[ing]

In [10]:
def filter_ingredient(ing):
  """
  Takes in a string and removes words that aren't defined ingredients.

  The text is lower-cased, stripped of punctuation, parenthesised remarks and
  digits, then lemmatized. The remaining words are scanned left to right and
  at each position the longest known name wins: three words, then two words,
  then the same two words in reverse order, then the single word. Matches are
  mapped to their main name with `get_name`; everything else is dropped.

  Returns the matched names joined by single spaces ('' if nothing matched).
  """
  ing = ing.lower()

  for separator in ('-', ',', '/'):
    ing = ing.replace(separator, ' ')

  # remove punctuation but keep parentheses, which are handled next
  ing = ing.translate(str.maketrans('', '', string.punctuation.replace('()', "")))

  # remove parenthesised items one group at a time, innermost first; a greedy
  # r'\(.*\)' would turn '1 (14 oz) can tomatoes (chopped)' into '1 '
  previous = None
  while previous != ing:
    previous = ing
    ing = re.sub(r'\([^()]*\)', "", ing)

  # remove digits (fractions need no separate step: '/' is already a space)
  ing = re.sub(r'\d', "", ing)

  # lemmatize words
  words = [lemmatizer.lemmatize(word) for word in ing.split()]

  # the following loop ensures multi-word ingredient names
  # are included without including the subwords
  found = []
  i, n = 0, len(words)
  while i < n:
    trigram = f'{words[i]}_{words[i+1]}_{words[i+2]}' if i + 2 < n else None
    bigram = f'{words[i]}_{words[i+1]}' if i + 1 < n else None
    reversed_bigram = f'{words[i+1]}_{words[i]}' if i + 1 < n else None

    if trigram in all_names:
      found.append(get_name(trigram))
      i += 3
    elif bigram in all_names:
      found.append(get_name(bigram))
      i += 2
    elif reversed_bigram in all_names:
      found.append(get_name(reversed_bigram))
      i += 2
    else:
      if words[i] in all_names:
        found.append(get_name(words[i]))
      i += 1

  return " ".join(" ".join(found).split())

In [11]:
filter_ingredient('black pepper')

'black_pepper'

In [12]:
def filter_instruction(ins):
  """
  Takes in a string and normalises the ingredients without removing other words.

  The text is lower-cased, stripped of punctuation and digits, English
  stopwords are dropped and the remaining words are lemmatized. The words are
  then scanned left to right; at each position the longest known ingredient
  name wins (three words, two words, the same two words reversed, one word)
  and is replaced by its main name via `get_name`. Words that are not part of
  an ingredient name are kept as they are, including the last words of the
  instruction.

  Returns the resulting words joined by single spaces.
  """
  ins = ins.lower()

  for separator in ('-', ',', '/'):
    ins = ins.replace(separator, ' ')

  # remove punctuation
  ins = ins.translate(str.maketrans('', '', string.punctuation))

  # remove digits
  ins = re.sub(r'\d', "", ins)

  # lemmatize words and remove stopwords
  sw = set(stopwords.words('english'))
  words = [lemmatizer.lemmatize(word) for word in ins.split() if word not in sw]

  kept = []
  i, n = 0, len(words)
  while i < n:
    trigram = f'{words[i]}_{words[i+1]}_{words[i+2]}' if i + 2 < n else None
    bigram = f'{words[i]}_{words[i+1]}' if i + 1 < n else None
    reversed_bigram = f'{words[i+1]}_{words[i]}' if i + 1 < n else None

    if trigram in all_names:
      kept.append(get_name(trigram))
      i += 3
    elif bigram in all_names:
      kept.append(get_name(bigram))
      i += 2
    elif reversed_bigram in all_names:
      kept.append(get_name(reversed_bigram))
      i += 2
    else:
      # single word: map it if it is an ingredient, otherwise keep it verbatim
      kept.append(get_name(words[i]) if words[i] in all_names else words[i])
      i += 1

  return " ".join(" ".join(kept).split())

## Generating training corpus
Each document having the format:

"ing_1 ing_2 ing_3"

and if with instructions:

"ing_1 ing_2 ing_3 @@ inst_1 || inst_2 || inst_3"



In [None]:
def to_recipe_string_list(recipes, with_instructions=False):
  """
  Generator that takes in Recipe1M format recipes and returns the normalised
  string representation.

  Parameters:
    recipes: iterable of dicts with an 'ingredients' list (and, when
      with_instructions is set, an 'instructions' list) whose items carry the
      raw string under 'text'
    with_instructions: append the filtered instructions after ' @@ ',
      separated by ' || '

  Yields one string per recipe. Ingredients that filter down to an empty
  string are left out. A recipe without instructions yields an empty
  instruction part rather than reusing the previous recipe's instructions.
  """
  for recipe in recipes:
    filtered_ings = (filter_ingredient(ing['text']) for ing in recipe['ingredients'])
    ing_string = " ".join(filtered for filtered in filtered_ings if filtered)

    if not with_instructions:
      yield ing_string
      continue

    # built per recipe, outside any loop, so it is always defined and fresh
    instructions_string = " || ".join(
        filter_instruction(inst['text']) for inst in recipe['instructions']
    )
    yield ing_string + " @@ " + instructions_string

Save new dataset

In [None]:
# Write both corpora in one pass over the recipes, one recipe per line, so the
# two files stay line-aligned. Mode 'w' empties existing files, which replaces
# the earlier open-for-append followed by seek(0)/truncate().
with open(f'{directory}/recipes_ingredients_and_instructions.txt', 'w') as f1, \
     open(f'{directory}/recipes_ingredients_only.txt', 'w') as f2:
  for rec in to_recipe_string_list(data, with_instructions=True):
    # everything before the '@@' separator is the ingredient part
    ingredients = rec.split('@@')[0].strip()
    f1.write(rec + '\n')
    f2.write(ingredients + '\n')

# DIISH Model
This section implements the [DIISH heuristic](https://www.frontiersin.org/articles/10.3389/frai.2020.621766/full) for ingredient substitution

## Dependencies

In [13]:
import spacy
import numpy as np

from scipy.spatial import distance

from gensim.corpora import Dictionary
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath
from nltk import download
from nltk.stem import WordNetLemmatizer
download('wordnet')

from itertools import combinations
from collections import defaultdict
from math import ceil
import json
import string

[nltk_data] Downloading package wordnet to /Users/hisham/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# will ask you to restart your runtime the first time
!pip install spacy --upgrade

      Successfully uninstalled catalogue-1.0.0
  Attempting uninstall: srsly
    Found existing installation: srsly 1.0.5
    Uninstalling srsly-1.0.5:
      Successfully uninstalled srsly-1.0.5
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
  Attempting uninstall: spacy
    Found existing installation: spacy 2.2.4
    Uninstalling spacy-2.2.4:
      Successfully uninstalled spacy-2.2.4
Successfully installed catalogue-2.0.6 pathy-0.6.0 pydantic-1.8.2 spacy-3.1.2 spacy-legacy-3.0.8 srsly-2.4.1 thinc-8.0.10 typer-0.3.2


In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!pip install mean_average_precision

In [None]:
!pip install ml-metrics

## Word2Vec Model
Trained on ingredients concatenated with instructions

In [None]:
# LineSentence takes the corpus path directly. gensim's `datapath` helper is
# meant for gensim's bundled test data and only returned this path unchanged
# because `directory` is absolute.
sentences = LineSentence(f'{directory}/recipes_ingredients_and_instructions.txt')
model = Word2Vec(sentences=sentences)

In [None]:
model.save(f'{directory}/word2vec.model')

In [None]:
model = Word2Vec.load(f'{directory}/word2vec.model')

### W Score

In [None]:
def W(a, b):
  """
  Word2Vec similarity between two ingredient tokens, or 0 when either token
  is not in the model's vocabulary.

  Membership is tested on `model.wv` itself rather than on `model.wv.vocab`,
  which is not available in the gensim 4.x release this notebook installs.
  """
  if a not in model.wv or b not in model.wv:
    return 0
  return model.wv.similarity(a, b)

In [None]:
W('bay_leaf', 'thyme')

0.69178545

## Dictionary

In [None]:
def generate_dict():
  """
  Builds a gensim Dictionary from the ingredients-only corpus, treating each
  line as one document of whitespace-separated tokens.
  """
  corpus_path = f'{directory}/recipes_ingredients_only.txt'
  with open(corpus_path, 'r') as f:
    documents = [line.split() for line in f.readlines()]

  return Dictionary(documents=documents)

In [None]:
def save_dict(d, filename='ingredient_dictionary_2.txt'):
  """
  Saves dictionary `d` as text inside `directory`.

  Parameters:
    d: object exposing `save_as_text(path)` (e.g. a gensim Dictionary)
    filename: file name inside `directory`; the default is the name this
      helper has always written.

  NOTE(review): later cells load `dictionary.txt`, not the default written
  here - pass filename='dictionary.txt' if that is the file to refresh.
  """
  d.save_as_text(f'{directory}/{filename}')

In [None]:
dictionary = generate_dict()
save_dict(dictionary)

In [14]:
dictionary = Dictionary.load_from_text(f'{directory}/dictionary.txt')

## D Score

### Saving co-occurrence vectors

In [None]:
def generate_cooccurrence_vectors(path=f'{directory}/co-occurrence_vectors'):
  '''
  Saves a file for each word containing the co-occurrence vector.
  (used to speed up D execution)

  For a word w, entry j of its vector is the number of times token j appears
  in the recipes that contain w, divided by the number of those recipes (the
  raw all-zero vector is kept when w appears in no recipe). Each vector is
  written with np.savetxt to `{path}/{word}.npy`.

  The corpus is read once and all vectors are accumulated in a
  len(dictionary) x len(dictionary) array, instead of rescanning the whole
  file once per word.
  '''
  import os  # local import: the notebook only imports `os` in a later cell
  os.makedirs(path, exist_ok=True)

  size = len(dictionary)
  totals = np.zeros((size, size))
  recipe_counts = np.zeros(size)

  with open(f'{directory}/recipes_ingredients_only.txt', 'r') as f:
    for line in f:
      ids = [dictionary.token2id[ing] for ing in line.split()]
      # a recipe counts once per distinct word, but every token occurrence
      # (duplicates included) is added to that word's vector
      for word_id in set(ids):
        recipe_counts[word_id] += 1
        for ing_id in ids:
          totals[word_id, ing_id] += 1

  for word in dictionary.itervalues():
    word_id = dictionary.token2id[word]
    vector = totals[word_id]
    if recipe_counts[word_id] != 0:
      vector = vector / recipe_counts[word_id]
    np.savetxt(f'{path}/{word}.npy', vector)

In [None]:
def get_cooccurrence_matrix(path=f'{directory}/co-occurrence_vectors'):
  '''
  Reads the saved co-occurrence vector of every dictionary entry, in id
  order, and stacks them into one array (row i belongs to dictionary[i]).
  '''
  rows = [
      np.loadtxt(f'{path}/{dictionary[token_id]}.npy')
      for token_id in range(len(dictionary))
  ]
  return np.array(rows)

In [None]:
generate_cooccurrence_vectors()

In [None]:
np.savetxt(f'{directory}/cooccurrence_matrix.npy', get_cooccurrence_matrix())

In [None]:
co_occ = np.loadtxt(f'{directory}/cooccurrence_matrix.npy')

In [None]:
def D(a, b, matrix=co_occ):
  '''
  Cosine similarity between the co-occurrence vectors of ingredients a and b.

  Parameters:
    a, b: ingredient tokens present in `dictionary`
    matrix: array whose row i is the co-occurrence vector of dictionary[i];
      the rows for a and b are read from it. Pass None to compute the two
      vectors directly from the ingredients-only corpus instead.

  Returns 0 when either vector is all zeros, otherwise
  1 - scipy cosine distance.
  '''
  if matrix is not None:
    # read from the matrix that was passed in, not from the global co_occ
    a_vector = matrix[dictionary.token2id[a]]
    b_vector = matrix[dictionary.token2id[b]]

  else:
    a_vector, b_vector = np.zeros(len(dictionary)), np.zeros(len(dictionary))
    a_recipe_count, b_recipe_count = 0, 0
    with open(f'{directory}/recipes_ingredients_only.txt', 'r') as f:
      for line in f:
        ings = line.split()
        if a in ings:
          a_recipe_count += 1
          for ing in ings:
            a_vector[dictionary.token2id[ing]] += 1

        if b in ings:
          b_recipe_count += 1
          for ing in ings:
            b_vector[dictionary.token2id[ing]] += 1

    # average over the recipes that contain the ingredient
    if a_recipe_count != 0:
      a_vector = a_vector/a_recipe_count

    if b_recipe_count != 0:
      b_vector = b_vector/b_recipe_count

  # cosine distance is undefined for a zero vector
  if np.count_nonzero(a_vector) == 0 or np.count_nonzero(b_vector) == 0:
    return 0

  return 1 - distance.cosine(a_vector, b_vector)

In [None]:
D('tuna', 'salmon')

0.6602318567679469

## P Score

### Generating context count vectors

#### Fc

In [None]:
def get_fc():
  """
  Counts ingredient pairs over the ingredients-only corpus.

  Returns a len(dictionary) x len(dictionary) array where entry [i][j] is the
  number of times token i precedes token j within a recipe line (pairs come
  from itertools.combinations, so they follow line order).
  """
  size = len(dictionary)
  fc = np.zeros((size, size))
  token_ids = dictionary.token2id
  with open(f'{directory}/recipes_ingredients_only.txt', 'r') as f:
    for line in f:
      ids = [token_ids[token] for token in line.split()]
      for first, second in combinations(ids, 2):
        fc[first][second] += 1
  return fc

In [None]:
fc = get_fc()

In [None]:
np.savetxt(f'{directory}/context_counts.npy', fc)

#### Fic

In [None]:
def generate_context_vectors(path=f'{directory}/ingredient_context_counts'):
  '''
  Saves, for each dictionary word, the pair counts restricted to the recipes
  that contain that word.

  For every recipe line containing the word, each ordered pair from
  itertools.combinations(tokens, 2) increments the matching cell of a
  len(dictionary) x len(dictionary) array, which is then written with
  np.savetxt to `{path}/{word}.npy`. The output folder is created if needed.
  '''
  import os  # local import: the notebook only imports `os` in a later cell
  os.makedirs(path, exist_ok=True)

  size = len(dictionary)
  with open(f'{directory}/recipes_ingredients_only.txt', 'r') as f:
    for word in dictionary.itervalues():
      fic = np.zeros((size, size))
      for line in f:
        ings = line.split()
        if word in ings:
          for first, second in combinations(ings, 2):
            fic[dictionary.token2id[first], dictionary.token2id[second]] += 1
      np.savetxt(f'{path}/{word}.npy', fic)
      # rewind so the next word scans the corpus from the start
      f.seek(0)

In [None]:
def get_fic_matrix(path=f'{directory}/ingredient_context_counts'):
  '''
  Reads the saved context-count array of every dictionary entry, in id order,
  and stacks them into a single array (index i belongs to dictionary[i]).
  '''
  per_word = [
      np.loadtxt(f'{path}/{dictionary[token_id]}.npy')
      for token_id in range(len(dictionary))
  ]
  return np.array(per_word)

In [None]:
generate_context_vectors()

In [None]:
fic = get_fic_matrix()

In [None]:
np.save(f'{directory}/fic_matrix.npy', fic)

### PPMI and P function definitions

In [None]:
def PPMI(fic, fi, fc):
  '''
  Weighted positive pointwise mutual information.

  Parameters:
    fic: array of counts of each context together with the ingredient
    fi: count of the ingredient (scalar)
    fc: array of counts of each context, same length as fic

  For every context the value is
    max(0, log10(fic * len(dictionary) * len(fc) / (fi * fc)) * sqrt(max(fi, fc)))
  Contexts whose ratio is zero or undefined (fic == 0 or fi * fc == 0) get 0
  directly, so no logarithm of zero is taken and no NaN can be produced.
  '''
  denominator = fi * fc
  ratio = np.divide(fic * len(dictionary) * len(fc), denominator,
                    out=np.zeros(fic.shape, dtype=float), where=denominator != 0)

  # log only where it is defined; the remaining entries stay at 0
  pmi = np.zeros(fic.shape, dtype=float)
  np.log10(ratio, out=pmi, where=ratio > 0)

  return np.maximum(pmi * np.sqrt(np.maximum(fi, fc)), 0)

In [None]:
fc = np.loadtxt(f'{directory}/context_counts.npy')
fic = np.load(f'{directory}/fic_matrix.npy')
def P(a, b):
  '''
  Cosine similarity between the PPMI vectors of ingredients a and b.

  Each ingredient's context counts (`fic` row) are flattened and passed to
  PPMI together with its document frequency and the flattened `fc`.
  Returns 0 when either PPMI vector is all zeros.
  '''
  flat_fc = fc.flatten()

  def ppmi_vector(token):
    token_id = dictionary.token2id[token]
    return PPMI(fic[token_id].flatten(), dictionary.dfs[token_id], flat_fc)

  ppmi_a = ppmi_vector(a)
  ppmi_b = ppmi_vector(b)

  if not np.count_nonzero(ppmi_a) or not np.count_nonzero(ppmi_b):
    return 0

  return 1 - distance.cosine(ppmi_a, ppmi_b)

In [None]:
P('tuna', 'salmon')

  This is separate from the ipykernel package so we can avoid doing imports until


0.5856681025085629

In [None]:
P('aubergine', 'egg')

  This is separate from the ipykernel package so we can avoid doing imports until


0.26148481301231197

## Spacy model

In [None]:
nlp = spacy.load('en_core_web_lg')

### S score

In [None]:
# Parse every dictionary token with spaCy once up front, so S() only has to
# look the parsed result up.
nlps = {}
for ing in dictionary.token2id:
  # tokens join multi-word names with '_'; the spaced form is what gets
  # parsed and is also the key S() uses for its lookup
  ing = ing.replace('_', ' ')
  nlps[ing] = nlp(ing)

def S(a, b):
  """
  spaCy similarity between two ingredient tokens, using the documents cached
  in `nlps`. Underscores are turned into spaces first because multi-word
  ingredients are cached under their spaced form.
  """
  doc_a = nlps[a.replace('_', ' ')]
  doc_b = nlps[b.replace('_', ' ')]
  return doc_a.similarity(doc_b)

In [None]:
S('vegetable_oil', 'butter')

0.6643049684318862

## DIISH Score

In [None]:
def DIISH(a, b):
  """
  Combined substitution score of ingredients a and b:

    W + S**2 + 0.5 * D**0.25 + 2 * P**0.5

  D and P are clamped at 0 before the fractional powers are taken. Both are
  computed as 1 - cosine distance, so rounding can leave them slightly below
  zero, and a negative base would turn the score into NaN or a complex number.
  """
  d_score = max(D(a, b), 0)
  p_score = max(P(a, b), 0)
  return W(a, b) + (S(a, b) ** 2) + (0.5 * d_score ** 0.25) + (2 * p_score ** 0.5)

In [None]:
DIISH('lard', 'butter')

  This is separate from the ipykernel package so we can avoid doing imports until


2.5720430224932502

In [None]:
DIISH('tuna', 'coconut')

  This is separate from the ipykernel package so we can avoid doing imports until


1.719364898570073

## Generating DIISH Matrix

In [None]:
def generate_DIISH_matrix(path=f'{directory}/DIISH_scores'):
  '''
  Computes the DIISH score of every pair of dictionary entries.

  Row i holds DIISH(dictionary[i], dictionary[j]) for all j. Each row is
  written with np.savetxt to `{path}/{dictionary[i]}.npy` as soon as it is
  complete, and the full matrix is returned. The output folder is created
  first, so a missing folder cannot fail the save after a row of work.
  '''
  import os  # local import: the notebook only imports `os` in a later cell
  os.makedirs(path, exist_ok=True)

  size = len(dictionary)
  DIISH_matrix = np.zeros((size, size))
  for i in range(size):
    for j in range(size):
      DIISH_matrix[i][j] = DIISH(dictionary[i], dictionary[j])
    np.savetxt(f'{path}/{dictionary[i]}.npy', DIISH_matrix[i])
    print(f'Word {i} done')
  return DIISH_matrix

In [None]:
DIISH_matrix = generate_DIISH_matrix()

In [None]:
np.savetxt(f'{directory}/DIISH_matrix.npy', DIISH_matrix)

## Getting Top Substitution Candidates

In [15]:
DIISH_matrix = np.loadtxt(f'{directory}/DIISH_matrix.npy')
def get_top_candidates(a, k=10):
  """
  Returns the k highest-scoring substitution candidates for ingredient a.

  Parameters:
    a: ingredient token present in `dictionary`
    k: number of candidates to return

  Returns a list of (ingredient, score) tuples sorted by descending score.
  NaN scores are skipped, and the ingredient itself is excluded by id rather
  than by assuming it always ranks first.
  """
  own_id = dictionary.token2id[a]
  scores = [
      (dictionary[i], score)
      for i, score in enumerate(DIISH_matrix[own_id])
      # score == score is False only for NaN
      if i != own_id and score == score
  ]
  return sorted(scores, key=lambda x: x[1], reverse=True)[:k]

In [16]:
get_top_candidates('milk', k=10)

[('evaporated_milk', 3.3692197987602963),
 ('butter', 3.217424772732188),
 ('egg', 3.179688533087118),
 ('flour', 3.150331974983061),
 ('cream', 3.1492243696733864),
 ('sugar', 3.1416614889123604),
 ('soured_cream', 3.088015758484327),
 ('double_cream', 3.067865237519814),
 ('buttermilk', 3.0621373331801607),
 ('margarine', 2.972302804994131)]

## Evaluation
Using data from [The Cook's Thesaurus](http://foodsubs.com/) scraped using https://github.com/solashirai/FoodSubstitutionDataScripts

In [4]:
with open(f'{directory}/scraped_thesaurus_substitutions.json', 'r') as f:
  data = json.load(f)

In [5]:
data[:5]

[['0',
  'amaranth\xa0= amaranth seeds',
  'millet OR quinoa OR buckwheat groats',
  'scraped_pages/subs_www.foodsubs.com_Grainoth.html.html'],
 ['1',
  'black quinoa',
  'quinoa',
  'scraped_pages/subs_www.foodsubs.com_Grainoth.html.html'],
 ['2',
  'millet',
  'quinoa OR bulgur OR       couscous',
  'scraped_pages/subs_www.foodsubs.com_Grainoth.html.html'],
 ['3',
  'psyllium seed husks = PSH = plantago seed husks       = flea seed',
  'oat       bran',
  'scraped_pages/subs_www.foodsubs.com_Grainoth.html.html'],
 ['4',
  'quinoa = hie',
  'couscous OR rice OR bulgur OR millet OR buckwheat groats OR amaranth',
  'scraped_pages/subs_www.foodsubs.com_Grainoth.html.html']]

In [12]:
lemmatizer = WordNetLemmatizer()

def filter_scraped_ingredient(ing):
  """
  Filters a scraped thesaurus entry and keeps it only if exactly one
  ingredient name remains. Anything else (nothing, or several names, which
  means a multi-word ingredient missing from our dictionary) becomes ''.
  """
  filtered = filter_ingredient(ing)
  return filtered if len(filtered.split()) == 1 else ''

In [13]:
# ingredient -> set of ingredients the thesaurus lists as its substitutes
substitutions = defaultdict(set)

for sub in data:
  # sub[1] lists the aliases of the source ingredient, separated by '='
  sub_from = {
      name
      for name in map(filter_scraped_ingredient, sub[1].strip().split('='))
      if name != ''
  }

  # if ingredients are undefined in our dictionary, go on to next substitution
  if not sub_from:
    continue

  # sub[2] lists the alternative substitutes, separated by 'OR'
  sub_to = {
      name
      for name in map(filter_scraped_ingredient, sub[2].strip().split('OR'))
      if name != ''
  }

  if not sub_to:
    continue

  for ing in sub_from:
    # an ingredient is never recorded as a substitute for itself
    substitutions[ing] = (substitutions[ing] | sub_to) - {ing}


In [14]:
for ing in list(substitutions.keys())[:10]:
  print(f'{ing}: {substitutions[ing]}')

amaranth: {'spinach', 'quinoa', 'buckwheat'}
quinoa: {'buckwheat', 'couscous', 'rice', 'oat', 'bulgur_wheat', 'amaranth'}
cheese: {'mozzarella', 'parmesan', 'yeast', 'cheddar', 'stilton', 'camembert', 'tofu', 'gouda', 'cottage_cheese', 'port_salut', 'monterey_jack', 'jackfruit', 'port', 'brie', 'ricotta', 'feta'}
gouda: {'jackfruit', 'edam'}
halloumi: {'mozzarella', 'feta'}
mozzarella: {'cheddar', 'tofu', 'cheese'}
port: {'beef', 'wine', 'jackfruit', 'vermouth'}
port_salut: {'jackfruit'}
scallop: {'skate', 'shrimp', 'crab', 'sole', 'flounder', 'squash', 'acorn_squash', 'lobster', 'monkfish', 'shark', 'cod'}
crayfish: {'lobster', 'shrimp', 'langoustine', 'crab'}


In [None]:
get_top_candidates('halloumi')

[('ricotta', 2.2897158383543754),
 ('guava', 2.15571825242428),
 ('asparagus', 2.1311106736682177),
 ('cheddar', 1.9967341464754786),
 ('dressing', nan),
 ('pimento', nan),
 ('salad', nan),
 ('melon', 1.9739713028076609),
 ('corn_oil', 1.9608168077126802),
 ('pomegranate', 1.9390146108687403)]

In [None]:
def get_map(threshold=0.5, k=5):
  '''
  Scores the model's top candidates against the thesaurus substitutions.

  threshold: an ingredient is a success when at least ceil(threshold * k) of
    its known substitutions appear among the top k candidates, or when all
    of its known substitutions appear there

  k: testing top k candidates from the model

  Returns (accuracy, failures): the fraction of ingredients that were a
  success (0.0 when there are no substitutions to test) and a dict mapping
  every failed ingredient to its known substitutions.
  '''
  success_threshold_length = ceil(threshold * k)
  failures = dict()
  success_count = 0
  for ing, subs in substitutions.items():
    candidates = {name for name, _ in get_top_candidates(ing, k=k)}
    matches = subs.intersection(candidates)
    if len(matches) >= success_threshold_length or len(matches) == len(subs):
      success_count += 1
    else:
      failures[ing] = subs
  # nothing to evaluate: report 0.0 instead of dividing by zero
  accuracy = success_count / len(substitutions) if substitutions else 0.0
  return accuracy, failures

In [None]:
get_map()[0]

0.23115577889447236

In [None]:
# looking at failures
get_map()[1]

# Vectorizers

## TFIDF Vectorizer

In [36]:
import os
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full
from gensim.models import TfidfModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors
import numpy as np


class TFIDFVectorizer(BaseEstimator, TransformerMixin):
  """
  TF-IDF vectorizer for tokenized recipes, built on a gensim Dictionary and
  TfidfModel.

  Parameters:
    dict_path: where the dictionary is stored as text (optional)
    model_path: where the TF-IDF model is stored (optional)

  Whatever already exists at those paths is loaded on construction; `fit`
  writes both back through `save`.
  """

  def __init__(self, dict_path=None, model_path=None):
    self.dict_path = dict_path
    self.model_path = model_path
    self.id2word = None
    self.tfidf = None
    self.load()

  def load(self):
    """Loads the dictionary and/or model if their files exist."""
    if self.dict_path is not None and os.path.exists(self.dict_path):
      self.id2word = Dictionary.load_from_text(self.dict_path)
    if self.model_path is not None and os.path.exists(self.model_path):
      self.tfidf = TfidfModel.load(self.model_path)

  def save(self):
    """Saves whichever of dictionary/model has both a path and a value."""
    if self.dict_path is not None and self.id2word is not None:
      self.id2word.save_as_text(self.dict_path)
    if self.model_path is not None and self.tfidf is not None:
      self.tfidf.save(self.model_path)

  def fit(self, documents, labels=None):
    """
    Builds the dictionary and TF-IDF model from tokenized documents, saves
    them and returns self. `labels` is accepted but not used.
    """
    self.id2word = Dictionary(documents)
    # Vocabulary pruning is currently disabled. The call below drops tokens
    # seen in fewer than 5 documents or in more than 80% of them and keeps
    # the 400 most frequent.
    # self.id2word.filter_extremes(no_below=5, no_above=0.8, keep_n=400)
    self.tfidf = TfidfModel(dictionary=self.id2word, normalize=True)
    self.save()
    return self

  def transform(self, documents):
    """
    Generator yielding one dense TF-IDF vector of length len(self.id2word)
    per tokenized document.
    """
    for document in documents:
      docvec = self.tfidf[self.id2word.doc2bow(document)]
      yield sparse2full(docvec, len(self.id2word))


## Doc2Vec Vectorizer

In [17]:
import os
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors
import numpy as np
import time


class Doc2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Doc2Vec vectorizer for tokenized recipes, wrapping a gensim Doc2Vec model.

    Parameters:
        path: where the model is stored (optional). A model already present
            at that path is loaded on construction. A path with no file yet
            leaves `model` as None so the instance can still be fitted and
            saved, matching how TFIDFVectorizer treats its paths.
    """

    def __init__(self, path=None):
        self.path = path
        self.model = None
        self.load()

    def load(self):
        """Loads the model from `self.path` if that file exists."""
        if self.path is not None and os.path.exists(self.path):
            self.model = Doc2Vec.load(self.path)

    def save(self, path=None):
        """
        Saves the model to `path`, or to `self.path` when no path is given.

        Raises ValueError when neither path is set.
        """
        target = self.path if path is None else path
        if target is None:
            raise ValueError('no path given and self.path is not set')
        self.model.save(target)

    def fit(self,
            documents=None,
            corpus_file=None,
            vector_size=600,
            min_count=5,
            seed=1,
            workers=2):
        """
        Trains a new Doc2Vec model and returns self.

        Trains from `corpus_file` when it is given; otherwise from
        `documents`, an iterable of token lists tagged with their position.
        The remaining arguments are passed on to Doc2Vec.
        """
        if corpus_file is None:
            corpus = [
                TaggedDocument(words, [idx])
                for idx, words in enumerate(documents)
            ]
            self.model = Doc2Vec(corpus,
                                 vector_size=vector_size,
                                 min_count=min_count,
                                 seed=seed,
                                 workers=workers)
        else:
            self.model = Doc2Vec(corpus_file=corpus_file,
                                 vector_size=vector_size,
                                 min_count=min_count,
                                 seed=seed,
                                 workers=workers)
        return self

    def transform(self, documents):
        """Generator yielding one inferred vector per tokenized document."""
        for document in documents:
            yield self.model.infer_vector(document)


## Training

### Training TF-IDF model

Train TF-IDF transformer on the filtered dataset (ingredients only)

In [24]:
filename = f'{directory}/recipes_ingredients_only.txt'
with open(filename, 'r') as f:
  data = f.readlines()

Tokenize every recipe before passing it to TFIDFVectorizer

In [17]:
def tokenize(recipes):
  """Generator yielding each recipe string split on whitespace."""
  yield from (recipe.split() for recipe in recipes)

In [18]:
tokenized = list(tokenize(data))

In [37]:
tfidf = TFIDFVectorizer(
    model_path=f'{directory}/tfidf_model_ingredients_only',
    dict_path=f'{directory}/dictionary.txt'
    )

# tfidf.fit(tokenized)

In [None]:
# tfidf.tfidf.save(f'{directory}/tfidf_model_ingredients_only')

Generate the vectors for each recipe in Recipe1M

In [16]:
tfidf_vecs = list(tfidf.transform(tokenized))

Save the vectors

In [None]:
np.savetxt(f'{directory}/tfidf_vectors_ingredients_only.gz', tfidf_vecs)

In [38]:
tfidf_vecs = np.loadtxt(f'{directory}/tfidf_vectors_ingredients_only.gz')

### Training Doc2Vec model

In [21]:
doc2vec = Doc2VecVectorizer().fit(corpus_file=f'{directory}/recipes_ingredients_and_instructions.txt')

In [30]:
# NOTE(review): in file order this load cell sits between the fit cell above
# and the save cell below, so a top-to-bottom run rebinds `doc2vec` to the
# model already stored at this path before the freshly fitted one is saved.
doc2vec = Doc2VecVectorizer(f'{directory}/doc2vec_model_ingredients_and_instructions.model')

In [22]:
doc2vec.save(f'{directory}/doc2vec_model_ingredients_and_instructions.model')

Generate recipe vectors

In [25]:
doc2vec_vecs = []
with open(f'{directory}/recipes_ingredients_and_instructions.txt', 'r') as f:
  for _, line in enumerate(f):
    line = line.split()
    doc2vec_vecs.append(doc2vec.model.infer_vector(line))

In [None]:
np.savetxt(f'{directory}/doc2vec_vectors_ingredients_and_instructions.gz', doc2vec_vecs)

In [31]:
doc2vec_vecs = np.loadtxt(f'{directory}/doc2vec_vectors_ingredients_and_instructions.gz')

In [None]:
len(doc2vec_vecs)

# Recipe Similarity

In [20]:
def split_array_ranges(length, k):
  """
  Takes in a length of a list and returns a list of (start, end) index tuples
  that cover range(length) in consecutive chunks of length // k items.

  When length is not a multiple of k the remainder forms one extra, shorter
  chunk at the end. When length < k the chunk size is 1 (a chunk size of 0
  would never advance and loop forever). A length of 0 yields [(0, 0)].

  Raises ValueError if k is less than 1.
  """
  if k < 1:
    raise ValueError('k must be at least 1')

  step = max(1, length // k)
  chunks = []
  start_ind = 0
  for end_ind in range(step, length, step):
    chunks.append((start_ind, end_ind))
    start_ind = end_ind
  chunks.append((start_ind, length))
  return chunks

def get_most_similar(docvec, docvecs, k = 20, n_clusters = 10):
  """
  Gets the k most similar recipes

  The corpus is searched chunk by chunk (to bound the size of each
  nearest-neighbour fit) and the per-chunk results are merged.

  Parameters:
    docvec: the vector of the queried recipe
    docvecs: the vectors of the corpus
    k: how many recipes to return
    n_clusters: how many times to split the data

  Returns
    a list of at most k (index, distance) tuples sorted by cosine distance,
    with indices referring to positions in docvecs
  """
  similar_recipes = []
  for start, end in split_array_ranges(len(docvecs), n_clusters):
    chunk_size = end - start
    if chunk_size <= 0:
      continue
    # A chunk shorter than k is still searched, asking only for as many
    # neighbours as it holds, so the tail of the corpus is not skipped.
    nbrs = NearestNeighbors(n_neighbors=min(k, chunk_size),
                            metric='cosine',
                            algorithm='auto').fit(docvecs[start:end])
    distances, indices = nbrs.kneighbors([docvec])
    # kneighbors returns positions inside the chunk; shift to corpus positions
    similar_recipes.extend(zip(indices[0] + start, distances[0]))
  return sorted(similar_recipes, key=lambda x: x[1])[:k]

# import itertools

# SIZE = 1029720

# def get_most_similar_gen(docvec, docvecs, k = 20, n_clusters = 10):
#   # cut data to n_clusters number of clusters
#   similar_recipes = []
#   for start, end in split_array_ranges(SIZE, n_clusters):
#     if end - start < k:
#       break
#     vectors = list(itertools.islice(docvecs, end - start))
#     nbrs = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='auto').fit(vectors)
#     distances, indicies = nbrs.kneighbors([docvec])
#     indicies = list(map(lambda x: x+start, indicies))
#     for x in zip(indicies[0], distances[0]):
#       similar_recipes.append(x)
#   return sorted(similar_recipes, key=lambda x: x[1])[:k]

# Ingredient Substitution

## GHG dictionary loading
Load in the ingredient carbon data from the API into a dictionary, taking into account alternate ingredient names.

In [21]:
# Filtered ingredient name -> GHG value reported by the knowledge base.
# Names that were never stored read as 0.0 through the defaultdict.
ghg_dict = defaultdict(float)
ids = requests.get('https://ecarekb.schlegel-online.de/foodon_ids').json()
for ing in ids:
  name = filter_ingredient(ing['ingredient'])
  if not name:
    continue

  # let requests build and escape the query string; whitespace runs are
  # collapsed to single spaces first, as the hand-built URL did
  req = requests.get(
      'https://ecarekb.schlegel-online.de/ingredient',
      params={'ingredient': " ".join(ing['ingredient'].split())},
  )
  # fail on an HTTP error here rather than on a confusing JSON/key error below
  req.raise_for_status()
  ghg = req.json()['ghg']
  ghg_dict[name] = ghg

  # alternate names inherit the value unless the name already has one
  for alt_name in ing['alternate_names']:
    alt_name = filter_ingredient(alt_name)
    if alt_name and alt_name not in ghg_dict:
      ghg_dict[alt_name] = ghg


## Final substitution function

In [26]:
def get_substitutions(ingredients: [str],
                      model,
                      vecs,
                      instructions: [str]=None,
                      verbose=False):
  """
  Suggests lower-carbon substitutions for the ingredients of a recipe.

  Parameters:
    ingredients: a list of ingredient strings
    model: vectorizer to use to get the recipe vector; its transform() must
      return an iterator over document vectors
    vecs: the corpus vectors produced by the same vectorizer
    instructions: a list of instruction strings (optional)
    verbose: print intermediate results

  Returns:
    a list of dicts with the keys 'from', 'to', 'confidence',
    'ghg_difference' and 'percent_reduction', sorted by confidence. Only
    substitutions whose source ingredient accounts for at least 20% of the
    recipe's total GHG and whose substitute has no higher GHG are returned.

  Relies on the notebook-level `data` (the lines of the ingredients-only
  corpus) to look up the ingredients of the similar recipes.
  """
  def ghg_of(name):
    # .get() keeps lookups of unknown names from adding entries to ghg_dict
    return ghg_dict.get(name, 0.0)

  # filter ingredients and instructions and then tokenize them; one input
  # string can yield zero, one or several ingredient names, so the filtered
  # strings are split into individual tokens like the corpus lines are
  ingredients = [
      token
      for ing in ingredients
      for token in filter_ingredient(ing).split()
  ]
  instructions = filter_instruction(" || ".join(instructions or [])).split()

  # concatenate the two using @@ if there are instructions
  if instructions:
    recipe = ingredients + ['@@'] + instructions
  else:
    recipe = ingredients

  # get recipe vector
  recipe_vec = next(model.transform([recipe]))

  # get the most similar recipes
  similar_recipes = get_most_similar(recipe_vec, vecs)
  if verbose:
    print('Similar recipes (index, confidence): ', similar_recipes)

  # ingredients of the similar recipes, read from the loaded corpus lines
  recipes = [data[index].split() for index, _ in similar_recipes]

  if verbose:
    print('Recipes\' ingredients:', recipes)

  # get the important and substitutable ingredients
  imp, subs = get_substitutable_ings(recipes)

  if verbose:
    print("Important: ", imp)
    print("Substitutable: ", subs)

  substitutable = set(subs)

  # keyed by (from, to) so a repeated ingredient cannot add duplicates
  candidates = {}
  for ingredient in ingredients:
    # only ingredients that are substitutable in the similar recipes
    if ingredient not in substitutable:
      continue
    for sim_ing, confidence in get_top_candidates(ingredient, k=5):
      # the substitute must be substitutable too and not already in the recipe
      if sim_ing in substitutable and sim_ing not in ingredients:
        candidates.setdefault(
            (ingredient, sim_ing),
            {'from': ingredient, 'to': sim_ing, 'confidence': confidence},
        )

  # sort by how confident we are of the substitution being a viable one
  substitutions = sorted(candidates.values(),
                         key=lambda x: x['confidence'],
                         reverse=True)

  # calculate total recipe carbon
  total_ghg = sum(ghg_of(ing) for ing in ingredients)

  # only return substitutions of ingredients that are high carbon
  # (>= 20% of total recipe ghg) and whose substitute has no higher ghg
  substitutions = [
      sub for sub in substitutions
      if ghg_of(sub['from']) >= ghg_of(sub['to'])
      and ghg_of(sub['from']) >= 0.2 * total_ghg
  ]

  # add ghg difference and percent reduction to substitutions
  for sub in substitutions:
    sub['ghg_difference'] = ghg_of(sub['from']) - ghg_of(sub['to'])
    if total_ghg == 0:
      sub['percent_reduction'] = 0
    else:
      sub['percent_reduction'] = sub['ghg_difference'] / total_ghg * 100

  return substitutions

def get_substitutable_ings(recipes, no_above = 0.8):
  """
  Separates the important ingredients from the substitutable ones

  Parameters:
    recipes: list of *tokenized* recipes
    no_above: passed to Dictionary.filter_extremes; ingredients that the
      filter removes are the important ones

  Returns
    important_ings: ingredients removed by the filter
    subs_ings: ingredients that survive the filter
  """
  id2word = Dictionary(recipes)

  # snapshot of every ingredient before the dictionary is filtered in place
  before_filter = list(id2word.values())
  id2word.filter_extremes(no_below=0, no_above=no_above)

  # after filtering (substitutable)
  subs_ings = list(id2word.values())
  survivors = set(subs_ings)
  important_ings = [ing for ing in before_filter if ing not in survivors]
  return important_ings, subs_ings


# Example Usage

In [39]:
ingredients = [
  "1 Tbsp Vegetable Oil",
  "3 Cloves Garlic",
  "1 Onion",
  "1 Carrot",
  "1 Stick Celery",
  "1 Tsp Marmite",
  "1 Vegetable Stock Cube",
  "1 Can Tomatoes,Chopped",
  "100ml Red Wine",
  "125g Dehydrated Soya Mince",
  "400g Spaghetti",
  "1 Tbsp Parmesan Cheese"
]

get_substitutions(ingredients, tfidf, tfidf_vecs)

[{'from': 'parmesan',
  'to': 'parsley',
  'confidence': 3.420124389639847,
  'ghg_difference': 20.802999999999997,
  'percent_reduction': 61.18169519440033},
 {'from': 'parmesan',
  'to': 'basil',
  'confidence': 3.372572536294494,
  'ghg_difference': 20.802999999999997,
  'percent_reduction': 61.18169519440033},
 {'from': 'parmesan',
  'to': 'cheddar',
  'confidence': 3.357463172007058,
  'ghg_difference': 0.0,
  'percent_reduction': 0.0},
 {'from': 'parmesan',
  'to': 'cheese',
  'confidence': 3.272324465175727,
  'ghg_difference': 0.0,
  'percent_reduction': 0.0}]

In [32]:
ingredients = [
  "Spaghetti",
  "Bacon",     
  "Olive oil",
  "Egg",
  "Parmesan"
]

instructions = [
  "Cook the pasta in a pan of boiling salted water according to the packet instructions.",
  "Slice the bacon and place in a non-stick frying pan on a medium heat with half a tablespoon of olive oil and a really good pinch of black pepper. Leave it to get super-golden and crispy, tossing occasionally, then turn off the heat.",
  "Meanwhile, beat the eggs in a bowl, then finely grate in the Parmesan and mix well.",
  "Use tongs to transfer your pasta straight into the pan and toss with the bacon.",
  "Pour the Parmesan eggs into the pan, and keep everything moving, loosening with splashes of the pasta cooking water until you have a silky sauce. Make sure the pan isn\u2019t too hot otherwise the eggs will scramble.",
  "Plate up the pasta, and finish with an extra grating of Parmesan.",
  "Tips",
  "EASY SWAPS",
  "Use Cheddar cheese instead of Parmesan.",
  "If you haven\u2019t got any eggs, don\u2019t worry \u2013 the starchy pasta water will be enough.",
  "Please enable targetting cookies to show this banner"
]

get_substitutions(ingredients, doc2vec, doc2vec_vecs, instructions=instructions)

[{'from': 'parmesan',
  'to': 'parsley',
  'confidence': 3.420124389639847,
  'ghg_difference': 20.802999999999997,
  'percent_reduction': 51.47473647745829},
 {'from': 'parmesan',
  'to': 'basil',
  'confidence': 3.372572536294494,
  'ghg_difference': 20.802999999999997,
  'percent_reduction': 51.47473647745829},
 {'from': 'parmesan',
  'to': 'cheese',
  'confidence': 3.272324465175727,
  'ghg_difference': 0.0,
  'percent_reduction': 0.0}]