In [1]:
import numpy as np
import pandas as pd

# Load Recipes

In [2]:
with np.load('../input/simplified-recipes-1M.npz', allow_pickle=True) as data:
    recipes = data['recipes']
    ingredients = data['ingredients']

# Create Helpers for Cleaning, Mapping and Clean Saving

In [3]:
recipes_test = recipes[0:10]

### Clean Helpers

In [4]:
file_delete = '../input/recipe_delete.csv'

In [5]:
df_delete = pd.read_csv(file_delete)

In [6]:
df_delete.head()

Unnamed: 0,to_delete
0,white
1,extrace
2,vegetable
3,leaves
4,packed


In [7]:
np_delete = df_delete.to_numpy()

### Mapping Helpers

In [8]:
file_mapping = '../input/recipe_mapping.csv'

In [9]:
df_mapping = pd.read_csv(file_mapping, delimiter=';')

In [10]:
df_mapping.head()

Unnamed: 0,recipes,classifier
0,chicken,chicken_breast
1,tomatoes,tomato
2,eggs,egg
3,onions,onion
4,black pepper,pepper


In [11]:
mapping_dict = dict(zip(df_mapping['recipes'], df_mapping['classifier']))

### Add Classifier Labels to Ingredients

In [12]:
file_classifier_labels = '../input/recipe_food_classifier_labels.csv'

In [13]:
df_classifier = pd.read_csv(file_classifier_labels, delimiter=';')

In [14]:
df_classifier.head()

Unnamed: 0,classifier_labels
0,almond
1,apple
2,apricot
3,avocado
4,banana


In [15]:
np_classifier = df_classifier.to_numpy()

In [16]:
ingredients = np.append(ingredients, np_classifier)

In [17]:
ingredients.shape

(3552,)

In [18]:
keys = ingredients
values = range(len(ingredients))

dict_ingr_ind_rev = dict(zip(keys, values))

# Clean and Map the Data Recipes

In [19]:
len(recipes)

1067557

In [20]:
recipes_cleaned = []

# if the recipe number i is not empty
for recipe in range(len(recipes)):
    if (len(recipes[recipe]) > 0):
        current_recipe = ingredients[recipes[recipe]]

        # clean not needed ingredients
        elements_to_delete = np.intersect1d(current_recipe,np_delete)
        mask_delete = np.isin(current_recipe, elements_to_delete) 
        clean_recipe = current_recipe[~mask_delete]

        # map cleaned ingredients to custom defined mapping
        mapped_recipe = [mapping_dict[ingredient] if (ingredient in mapping_dict.keys()) else ingredient for ingredient in clean_recipe]

        # turn ingredients back to indexes
        clean_map_recipe_idx = [dict_ingr_ind_rev[ingredient] for ingredient in mapped_recipe]

        # create uniques
        clean_map_recipe_idx = np.unique(clean_map_recipe_idx)

        # append cleaned recipes list
        recipes_cleaned.append(clean_map_recipe_idx)
        
    else:
        pass

In [21]:
len(recipes_cleaned)

1067556

In [22]:
recipes_cleaned = np.asarray(recipes_cleaned)

# Save Cleaned Ingredients & Recipes

In [23]:
#np.save('ingredients_clean', unique_ingredients)
np.save('../output/ingredients_clean', dict_ingr_ind_rev)

In [24]:
np.save('../output/recipes_clean', recipes_cleaned)

In [25]:
tmp = {v:k for (k,v) in dict_ingr_ind_rev.items()}

# For Web App

In [26]:
ingredients_stream = ['olive', 'paprika', 'radish']

In [27]:
ingredients_stream_ids = [dict_ingr_ind_rev[ingr] for ingr in ingredients_stream]

In [28]:
ingredients_stream_ids

[3527, 3529, 3537]

In [30]:
recipes_cleaned

array([array([  12,   72,  150,  560, 2081, 2754, 3510, 3540]),
       array([   0,    2,    8,   26,   31,   53,  224,  791,  798,  965, 1031,
       1884, 3528]),
       array([   0,    1,    3,    7,    8,   55,   63,   64,  102,  140,  262,
        602,  655,  664, 1196, 3509, 3527, 3529, 3540, 3545, 3548]),
       ...,
       array([   0,    3,   60,   72,   76,  155, 1102, 1603, 3520, 3548]),
       array([  0,   2,   4,   8,  47, 348]),
       array([   1,    2,    3,    7,    9,   31,   63,   64,   70,   72,  102,
        150,  201,  212, 1491, 1725, 3512, 3527])], dtype=object)

In [31]:
np.argwhere(recipes_cleaned == ingredients_stream_ids)

  """Entry point for launching an IPython kernel.


array([], shape=(0, 0), dtype=int64)