# Recipe2BetterRecipe 🍐🍋 🍊🍅

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.neural_network import BernoulliRBM

In [None]:
from src.utils import is_vegan, preprocess, decompose_vocab
from src.utils import plot_co2_estimates, plot_histograms, plot_food_bias_seasons_Paris, plot_Zipf

## 1. Preprocess recipe corpus

In [None]:
# Concatenate both dataset splits into a single list of recipe dicts.
# train entry: {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives'] }
# test entry:  {'id': 10259, 'ingredients': ['romaine lettuce', 'black olives'] }
data = []
for path in ('recipes/train.json', 'recipes/test.json'):
    with open(path, 'r', encoding='utf-8') as fp:
        data += json.load(fp)

In [None]:
np.random.seed(123) # seed NumPy's global RNG for reproducibility

In [None]:
# Preprocess every recipe's ingredient list and keep only recipes with 2+ ingredients.
# (Author's notes: 49718 recipes go in, 49687 remain; preprocessing = is_alpha, lemmatize, - adjectives.)
corpus = []
for entry in data:
    tokens = preprocess(entry['ingredients'])
    if len(tokens) >= 2:
        corpus.append(tokens)

In [None]:
# Keep the recipes that is_vegan accepts (author's note: 12436 recipes, meat, fish, dairy free!)
vegan_corpus = list(filter(is_vegan, corpus))

In [None]:
# Distinct tokens over all recipes, and over the vegan subset only.
raw_vocab = {token for recipe in corpus for token in recipe}
raw_vegan_vocab = {token for recipe in vegan_corpus for token in recipe}

In [None]:
# Quick sanity check: first vegan recipe, corpus size, vocabulary size.
print(vegan_corpus[0])
n_vegan_recipes = len(vegan_corpus)
print('{} vegan recipes'.format(n_vegan_recipes))
n_vegan_tokens = len(set(raw_vegan_vocab))
print('{} unique tokens'.format(n_vegan_tokens))

## 2. Load seasonal fruits & vegs vocab

In [None]:
# Seasonal fruits & vegs in Paris, from Leers et Fessard (2017).
# Each file holds one comma-separated line per month; the line index is the
# month_id (0 = first line, 11 = December in the cells below).
seasons = {}
for path in ('seasons/paris-fruits.txt', 'seasons/paris-vegs.txt'):
    # explicit encoding, consistent with how the recipe files are opened
    with open(path, 'r', encoding='utf-8') as fp:
        for month_id, line in enumerate(fp):
            foodlist = [food.strip() for food in line.split(',')]
            # setdefault: no KeyError if one file has more lines than the other;
            # blank entries (empty line, trailing comma) are not foods, so skip them
            seasons.setdefault(month_id, []).extend(food for food in foodlist if food)

In [None]:
# Every fruit/veg that is in season in at least one month.
fruits_and_vegs = {food for foodlist in seasons.values() for food in foodlist}

## 3. Remap corpus (vocab)

In [None]:
# roots is used below as a lookup (roots[food]) from a raw token to the token that replaces it.
# NOTE(review): fixed_points presumably keeps the seasonal fruit/veg names unchanged — confirm in src.utils.
roots = decompose_vocab(raw_vegan_vocab, fixed_points=fruits_and_vegs)

In [None]:
# Replace each token that has an entry in roots by that entry, then de-duplicate within the recipe.
vegan_corpus = [
    list({roots[food] if food in roots else food for food in recipe})
    for recipe in vegan_corpus
]

In [None]:
# Distinct foods after remapping. A set has no order, so sorting every token
# occurrence before building it was wasted work; build the set directly.
vocab = {food for foodlist in vegan_corpus for food in foodlist}

In [None]:
# Same sanity check as before, now on the remapped corpus and vocabulary.
print(vegan_corpus[0])
n_vegan_recipes = len(vegan_corpus)
print('{} vegan recipes'.format(n_vegan_recipes))
n_vocab_tokens = len(set(vocab))
print('{} unique tokens'.format(n_vocab_tokens))

In [None]:
# Compare ingredients-per-recipe for the full corpus and the remapped vegan corpus.
all_lengths = [len(x) for x in corpus]
vegan_lengths = [len(x) for x in vegan_corpus]
plot_histograms(x1=all_lengths,
                x2=vegan_lengths,
                title1='All recipes \n μ={:.2f}'.format(np.mean(all_lengths)),
                title2='Vegan recipes (preprocessed+remapped) \n μ={:.2f}'.format(np.mean(vegan_lengths)),
                bins1=65, bins2=30, filename='remapped_vegan_recipes.png')

## 4. Food2ID mapping

In [None]:
# food -> column id and its inverse.
# Ids are assigned in sorted order: iterating the vocab set directly gives an
# order that changes between interpreter runs (string hash randomization),
# which would permute the feature columns and defeat the fixed RNG seed.
vocab_map = {food: idx for idx, food in enumerate(sorted(vocab))}
inv_map = {idx: food for food, idx in vocab_map.items()}

In [None]:
# Relative frequency of each food: every occurrence in a recipe adds 1/len(vegan_corpus).
vocab_counts = dict.fromkeys(vocab, np.float64(0.))
for food in (item for foodlist in vegan_corpus for item in foodlist):
    vocab_counts[food] += 1/len(vegan_corpus)

In [None]:
# Food frequencies in descending order (index = frequency rank).
histogram = np.array(sorted(vocab_counts.values(), reverse=True))

## 5. Intersect seasonality and vegan_corpus

In [None]:
# Split the seasonal foods into those present in the recipe vocabulary ("understood")
# and those that never occur in it ("misunderstood").
seasonal_foods = [food for foodlist in seasons.values() for food in foodlist]
fruits_and_vegs = {food for food in seasonal_foods if food in vocab_counts}
misunderstood = {food for food in seasonal_foods if food not in vocab_counts}

fruits_and_vegs = np.array(list(fruits_and_vegs))
print(len(fruits_and_vegs),'fruits and vegs understood')
print('Misunderstood:',misunderstood)

In [None]:
# Frequency of each understood fruit/veg, its rank in the descending histogram
# (first position holding the same frequency), and the order that sorts foods by rank.
fruits_and_vegs_bias = np.array([vocab_counts[food] for food in fruits_and_vegs])
fruits_and_vegs_ranked = np.array([np.flatnonzero(histogram == bias)[0] for bias in fruits_and_vegs_bias])
ranked_idx = fruits_and_vegs_ranked.argsort()

In [None]:
# plot food bias (word frequency) + seasonality in december
top_k = 30
month_id = 11
most_frequent = ranked_idx[:top_k][::-1] # the top_k best-ranked foods, least frequent of them first
plot_food_bias_seasons_Paris(fruits_and_vegs, fruits_and_vegs_bias, most_frequent, seasons, month=month_id, n_subplots=1, figsize=(6,10))

In [None]:
# Zipf-law plot via the project helper: it receives the vocabulary, the descending
# frequencies, and the rank / frequency / rank-order of the seasonal fruits and vegs.
# NOTE(review): where 'Zipflaw_power.png' is written is decided inside plot_Zipf.
plot_Zipf(vocab, histogram, fruits_and_vegs_ranked, fruits_and_vegs_bias, ranked_idx, filename='Zipflaw_power.png')

In [None]:
# Pie chart of the most frequent seasonal fruits/vegs; slices are ordered and plotted counter-clockwise.
def my_autopct(pct):
    """Format a slice percentage with two decimals; return '' for slices of 2% or less."""
    if pct > 2.:
        return '%.2f' % pct
    return ''
# NOTE(review): my_autopct is not passed to ax2.pie below, which uses the '%1.1f%%' format string.

top_k = 14 # show top_k
ranked_foods = fruits_and_vegs[ranked_idx]
ranked_bias = fruits_and_vegs_bias[ranked_idx]
labels = list(ranked_foods[:top_k]) + ['{} other \n fruits/vegs'.format(len(fruits_and_vegs)-top_k)]
sizes = list(ranked_bias[:top_k]) + [np.sum(ranked_bias[top_k:])]
explode = [0.]*top_k + [0.1] # pull out only the last slice, i.e. the aggregated "other" group

fig2, ax2 = plt.subplots(figsize=(10,10))
ax2.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=90, textprops={'fontsize': 20})
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.subplots_adjust(wspace=None, hspace=None)
fig2.tight_layout()
plt.savefig('img/Zipflaw_pie.png')
plt.show()

## 6. Basket of food, aka bag of words

In [None]:
# Each recipe as the list of column ids of its foods.
X = [[vocab_map[item] for item in foods] for foods in vegan_corpus]

In [None]:
# Binary bag-of-words matrix: one row per non-empty recipe, one column per food id.
# Allocating the matrix up front keeps it 2-D even when no recipe survives
# (the list-of-rows version collapsed to shape (0,)) and avoids holding a
# second copy of every row while the list is converted.
non_empty = [bow for bow in X if len(bow)>0]
X_ = np.zeros((len(non_empty), len(vocab_map))) # (n_recipes, n_ingredients) aka (n_samples, n_features)
for recipe_vector, bow in zip(X_, non_empty):
    recipe_vector[bow] = 1 # recipe_vector is a view on one row of X_

In [None]:
print(X_.shape)
n_cells = X_.shape[0]*X_.shape[1]
# NOTE(review): the value printed is the percentage of non-zero cells (fill rate), although the label says 'sparsity'.
print('sparsity {:.2f}%'.format(100*np.sum(X_)/n_cells))

## 7. Fit RBM on dataset X

In [None]:
# Fit a Bernoulli RBM with 150 hidden units on the binary recipe matrix.
# No random_state is passed; NOTE(review): scikit-learn then presumably draws from NumPy's
# global RNG (seeded earlier with np.random.seed) — confirm for the installed version.
model = BernoulliRBM(n_components=150, batch_size=32, learning_rate=0.1, n_iter=100, verbose=0)
model.fit(X_)

In [None]:
# Keep references to the fitted parameters; v stays the learned visible bias even after
# model.intercept_visible_ is re-assigned by the resampling helpers further down.
h = model.intercept_hidden_ # hidden-unit biases
v = model.intercept_visible_ # visible-unit biases, indexed by food id below
W = model.components_ # weights of the fitted RBM

In [None]:
# Side-by-side scatter plots of the sorted hidden and visible biases of the fitted RBM.
plt.figure(1,figsize=(15,5))
panels = (
    (121, h, 'hidden states', 'Sorted diets bias (c)', {}),
    (122, v, 'visible states', 'Sorted visible bias (b)', {'alpha': 0.3}),
)
for position, bias_values, x_label, panel_title, extra_style in panels:
    plt.subplot(position)
    plt.scatter(np.arange(len(bias_values)), np.sort(bias_values), c='black', marker='+', **extra_style)
    plt.xlabel(x_label)
    plt.ylabel('bias')
    plt.title(panel_title)
plt.savefig('img/RBM_biases.png')
plt.show()

## 8. Bias 12 models for Paris (dynamic) +1 (vanilla static model)

In [None]:
# month_id (0-11) -> {'v': visible bias adjusted for that month, 'HS': 0/1 out-of-season mask}; filled below
dynamic_model = {}

In [None]:
# Indicator over the vocabulary: 1 for fruits/vegs with a known season, 0 otherwise.
fruits_and_vegs_vector = np.zeros(len(vocab_map))
seasonal_ids = [vocab_map[food] for food in fruits_and_vegs]
fruits_and_vegs_vector[seasonal_ids] = 1

In [None]:
# One biased copy of the visible bias per month: foods that are out of season that month
# get their bias lowered by 15, and are flagged in the month's out-of-season mask.
# (A +1 boost for in-season foods was tried by the author and left disabled.)
for month_id in range(12):
    in_season = seasons[month_id]
    v_dynamic = v.copy()
    out_of_seasons = fruits_and_vegs_vector.copy() # start from all fruits/vegs

    for food in fruits_and_vegs:
        fid = vocab_map[food]
        if food not in in_season:
            v_dynamic[fid] -= 15. # lower probability of out of season fruits/vegs
            continue
        out_of_seasons[fid] = 0 # in season this month: clear the flag

    # mask is 1 if the fruit or veg is out of season, 0 otherwise
    dynamic_model[month_id] = {'v':v_dynamic, 'HS':out_of_seasons.astype(np.int32)}

## 9. Recipe2BetterRecipe stats: % seasonal ingredients for each month and GHG excess

In [None]:
# NOTE(review): the meaning of threshold is defined in src.utils and not visible from this notebook.
plot_co2_estimates(threshold=10.) # from Bon Pour le Climat

In [None]:
def get_metrics(X_, models=None):
    """Compute out-of-season statistics of a recipe matrix for each of the 12 months.

    Parameters
    ----------
    X_ : 2-D array, one row per recipe and one column per food id (non-zero = present).
    models : optional mapping month_id -> {'HS': 0/1 out-of-season mask over food ids}.
        Defaults to the notebook-level ``dynamic_model``.

    Returns
    -------
    dict of four 12-element lists:
        'OOS_prop' : mean percentage of a recipe's ingredients that are out of season
        'GHG_low' / 'GHG_high' : that mean ratio scaled by 5.7 / 13.3
            (lower / upper bound on GHG excess, in kg CO2/kg)
        'counts'   : percentage of recipes with no out-of-season ingredient
    """
    if models is None:
        models = dynamic_model
    X_ = np.asarray(X_)
    # Month-independent, so computed once. Clamped to 1 so a recipe without
    # ingredients yields a ratio of 0 instead of a division by zero
    # (same guard as the dynamic-model cell).
    n_ingredients_ = np.maximum(np.sum(X_, axis=1), 1)

    metrics = {'OOS_prop':[], 'GHG_low':[], 'GHG_high':[], 'counts':[]}
    for month_id in range(12):
        # the 1-D mask broadcasts over the rows of X_
        n_ingredients_HS_ = np.sum(X_ * models[month_id]['HS'], axis=1)
        HS_ratio_ = n_ingredients_HS_/n_ingredients_
        mean_ratio = np.mean(HS_ratio_)
        metrics['OOS_prop'].append(100*mean_ratio) # percentage of ingredients out of season
        metrics['GHG_low'].append(5.7*mean_ratio) # lower bound on GHG excess (out of season, in kg CO2/kg)
        metrics['GHG_high'].append(13.3*mean_ratio) # upper bound on GHG excess
        metrics['counts'].append(100*np.sum(HS_ratio_==0)/len(X_)) # percentage of recipes seasonal
    return metrics

In [None]:
def R2BR(model, bias, X):
    """Resample a batch of recipes with one call to ``model.gibbs`` under ``bias``.

    ``model.intercept_visible_`` is overwritten with ``bias`` and stays that
    way after the call.

    Parameters
    ----------
    model : fitted RBM exposing ``intercept_visible_`` and ``gibbs``.
    bias : visible bias to sample with.
    X : 2-D recipe matrix (one row per recipe).

    Returns
    -------
    The resampled matrix without the rows that have no active ingredient.
    The result stays 2-D even when every row is dropped, so ``axis=1``
    reductions on it keep working.
    """
    model.intercept_visible_ = bias
    X = np.asarray(model.gibbs(X))
    return X[np.sum(X, axis=1) > 0]

In [None]:
# Each entry is replaced below by a dict of 12-element lists (one value per month).
stats = {'base': {}, 'static':{}, 'dynamic':{}} # metrics for different behaviors (no resample, static, dynamic)

In [None]:
# Original dataset
metrics = get_metrics(X_) # baseline: the recipes as written, no resampling
stats['base'] = metrics

In [None]:
# Resample once with the learned visible bias v, then score the result for all 12 months.
X1_ = R2BR(model, v, X_) # RBM (static)
metrics = get_metrics(X1_)
stats['static'] = metrics

In [None]:
# Same four metrics as get_metrics, but each month is resampled with that month's biased RBM.
metrics = {'OOS_prop':[], 'GHG_low':[], 'GHG_high':[], 'counts':[]}
for month_id in range(12):
    month_model = dynamic_model[month_id]
    X2_ = R2BR(model, month_model['v'], X_) # RBM (dynamic); rows that come back empty are dropped

    n_ingredients_ = np.maximum(np.sum(X2_, axis=1), 1) # number of ingredients, clamped to avoid division by zero
    n_ingredients_HS_ = np.sum(X2_ * month_model['HS'], axis=1) # number of out of seasons fruits/vegs (mask broadcasts over rows)
    HS_ratio_ = n_ingredients_HS_/n_ingredients_
    mean_ratio = np.mean(HS_ratio_)

    metrics['OOS_prop'].append(100*mean_ratio)
    metrics['GHG_low'].append(5.7*mean_ratio)
    metrics['GHG_high'].append(13.3*mean_ratio)
    # Percentage over the recipes actually scored (X2_), not the original X_:
    # R2BR may have dropped rows, and get_metrics also divides by the scored matrix.
    metrics['counts'].append(100*np.sum(HS_ratio_==0)/len(X2_))
stats['dynamic'] = metrics

In [None]:
#fig = plt.figure(1,figsize=(10,6))
#plt.scatter(np.arange(12), stats['base']['OOS_prop'], label='Vegan 10k', color='black', marker='o')
#plt.plot(np.arange(12), stats['static']['OOS_prop'], label='RBM (static)', color='gray', linestyle='--')
#plt.plot(np.arange(12), stats['dynamic']['OOS_prop'], label='Recipe2BetterRecipe (dynamic)', color='green')
#plt.xlabel('Months', size=18)
#plt.ylabel('Proportion \n out-of-season ingredients (%)', size=18)
#plt.legend()
#fig.tight_layout()
#plt.savefig('img/r2br_results_OOS_prop.png')
#plt.show()

In [None]:
#fig = plt.figure(1,figsize=(10,6))
#plt.scatter(np.arange(12), stats['base']['counts'], label='Vegan 10k', color='black', marker='o')
#plt.plot(np.arange(12), stats['static']['counts'], label='RBM (static)', color='gray', linestyle='--')
#plt.plot(np.arange(12), stats['dynamic']['counts'], label='Recipe2BetterRecipe (dynamic)', color='green')
#plt.xlabel('Months', size=18)
#plt.ylabel('Proportion \n local-seasonal recipes (%)', size=18)
#plt.legend()
#fig.tight_layout()
#plt.savefig('img/r2br_results_counts.png')
#plt.show()

In [None]:
# Monthly GHG excess (upper and lower bound) for the three behaviors; only the upper-bound
# series carries the legend label so each behavior appears once in the legend.
fig = plt.figure(1,figsize=(10,6))
months = np.arange(12)
series = (
    ('base', plt.scatter, 'Vegan 10k', {'color': 'black', 'marker': 'o'}),
    ('static', plt.plot, 'RBM (static)', {'color': 'gray', 'linestyle': '--'}),
    ('dynamic', plt.plot, 'Recipe2BetterRecipe (dynamic)', {'color': 'green'}),
)
for behavior, draw, legend_label, look in series:
    draw(months, stats[behavior]['GHG_high'], label=legend_label, **look)
    draw(months, stats[behavior]['GHG_low'], **look)
plt.xlabel('Months', size=18)
plt.ylabel('ΔG \n in kg CO2/kg', size=18)
plt.legend()
fig.tight_layout()
plt.savefig('img/r2br_results_excess.png')
plt.show()

## 10. Examples: Recipes out-of-season in December

In [None]:
def recipe2vec(vocab_map, recipe):
    """Encode a recipe as a (1, len(vocab_map)) array with 1 at the column of each food.

    Raises KeyError if a food is missing from ``vocab_map``.
    """
    n_features = len(vocab_map)
    encoded = np.zeros((1, n_features))
    for column in (vocab_map[food] for food in recipe):
        encoded[0, column] = 1
    return encoded

In [None]:
def vec2recipe(inv_map, recipe_vector):
    """Decode a 2-D recipe vector into the sorted array of food names of its positive columns."""
    _rows, columns = np.nonzero(recipe_vector > 0)
    names = list(map(inv_map.__getitem__, columns))
    return np.sort(names)

In [None]:
def revise_recipe(model, bias, recipe_vector):
    """Set ``model.intercept_visible_`` to ``bias`` and return ``model.gibbs(recipe_vector)``.

    The bias assignment persists on the model after the call.
    """
    model.intercept_visible_ = bias # whichever bias the caller chose (static or monthly)
    resampled = model.gibbs(recipe_vector)
    return resampled

In [None]:
def get_ingredients_HS(recipe_vector, out_of_seasons):
    """Mask a 2-D recipe vector with the out-of-season flags.

    Returns the masked vector and the column ids of its positive entries
    (the recipe's out-of-season ingredients).
    """
    oos_mask = np.atleast_2d(out_of_seasons)
    ingredients_HS = recipe_vector * oos_mask
    _rows, oos_id = np.nonzero(ingredients_HS > 0) # out of seasons ingredients id
    return ingredients_HS, oos_id

In [None]:
def r2br(recipe, recipe_vector, max_tries=1000):
    """Print, for each of the 12 months, whether a recipe is seasonal or what could replace its out-of-season items.

    For a month where the recipe has out-of-season ingredients, the recipe is
    resampled with that month's biased model until the sample contains at least
    one fruit/veg (from ``fruits_and_vegs``) that is not already in the recipe.

    Parameters
    ----------
    recipe : list of food names.
    recipe_vector : the recipe encoded as a (1, n_foods) vector.
    max_tries : upper bound on resampling attempts per month. Without it the
        search never terminates when the model cannot propose a new fruit/veg;
        after ``max_tries`` failures '(no alternative found)' is printed instead.

    Uses the notebook-level ``model``, ``dynamic_model``, ``inv_map`` and
    ``fruits_and_vegs``, and leaves ``model.intercept_visible_`` set to the
    last monthly bias it sampled with.
    """
    print('\n Original Recipe: {}'.format(', '.join(np.sort(recipe)))) # print recipe

    # Sets give exact, constant-time membership tests (fruits_and_vegs is a NumPy array).
    known_fruits_and_vegs = set(fruits_and_vegs)
    already_used = set(recipe)

    for month_id in range(12):
        _, oos_id = get_ingredients_HS(recipe_vector, dynamic_model[month_id]['HS'])
        if len(oos_id)==0:
            print('{} 🏷️ Local-seasonal'.format(month_id+1))
            continue

        deprecated = ', '.join([inv_map[fid].capitalize() for fid in oos_id])

        alternative = []
        for _ in range(max_tries):
            recipe_vector_dynamic = revise_recipe(model, dynamic_model[month_id]['v'], recipe_vector) # dynamic model
            alternative = [food for food in vec2recipe(inv_map, recipe_vector_dynamic)
                           if food in known_fruits_and_vegs and food not in already_used]
            if alternative:
                break

        suggestion = ', '.join(alternative) if alternative else '(no alternative found)'
        print('{} ♻️ {} >> {}'.format(month_id+1, deprecated, suggestion))

In [None]:
# Indices of recipes that are fully in season in at least one month but not in December (for eval).
has_availibility_not_december = set()
for i, recipe in enumerate(vegan_corpus):
    recipe_vector = recipe2vec(vocab_map, recipe)

    for month_id in range(12):
        ingredients_HS_, oos_id = get_ingredients_HS(recipe_vector, dynamic_model[month_id]['HS'])
        if len(oos_id) > 0:
            continue # something is out of season this month
        has_availibility_not_december.add(i)
        if month_id == 11:
            # seasonal in December too: not an example we want
            has_availibility_not_december.remove(i)

In [None]:
# Print the month-by-month report for the selected recipes (only the first one by default).
for id_ in has_availibility_not_december:
    recipe = vegan_corpus[id_]
    recipe_vector = recipe2vec(vocab_map, recipe)
    r2br(recipe, recipe_vector)
    break # comment to view all examples