In [1]:
import numpy as np
import pandas as pd
from operator import or_

## Load the dataset

The dataset was built from the menus of several Japanese restaurants located in Italy:
- KOI, Turin: http://www.ristorantekoi.it/carta.pdf
- SUSHI SUN, Turin: https://www.sushisuntorino.it/menu
- HAYASHI SUSHI, Rome: https://www.ristorantegiapponese-roma.com/menu-completo
- DAIFUKU, Rome: https://s43261ed02e900f3e.jimcontent.com/download/version/1353022243/module/6993894386/name/daifuku%20menu%20buffet.pdf

In [2]:
# read the semicolon-separated recipes file (Japanese dishes)
data = pd.read_csv('jappo.csv', sep=';')
# missing cells become empty strings, so every cell can be handled as text
data = data.fillna('')
# preview the first five rows
data.head()

Unnamed: 0,RECIPE,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,AMAEBI FRY,gamberi,pasta kataifi,salsa teriyaki,maionese piccante,,,,,,,,,,,
1,NORI EDAMAME,soya,alga Nori,Shichimi,,,,,,,,,,,,
2,MISO SOUP,Zuppa,alga Nori,tofu,,,,,,,,,,,,
3,OSUMASHI,Zuppa,alga Nori,asparagi,,,,,,,,,,,,
4,SHRIMP-SALMON,tempura,gamberi,philadelphia,salmone,tobiko,salsa teriyaki,,,,,,,,,


In [3]:
# one dataframe row per recipe, so the row count is the recipe count
print('There are {} recipes'.format(len(data)))

There are 147 recipes


## Data pre-processing

In [4]:
# normalise the casing so that names compare reliably:
# recipe names are stored in uppercase...
data['RECIPE'] = data['RECIPE'].str.upper()
# ...and the ingredient columns (labelled '1' to '15') in lowercase
for col in map(str, range(1, 16)):
    data[col] = data[col].str.lower()

In [5]:
# retrieve the list of all the distinct ingredients used in the ingredient columns '1'..'15'.
# - set().union(...) merges the columns without relying on a bare `reduce`, which is not a
#   builtin on Python 3 and is never imported in this notebook
# - the empty string only marks an unused ingredient slot, so it is subtracted from the set;
#   unlike list.remove(''), the subtraction does not fail when no slot is empty
# - sorting right away makes the order of the list (and so any ID derived from it) deterministic
ingredients = sorted(
    set().union(*(data[str(col)] for col in range(1, 16))) - {''}
)

In [6]:
# report how many distinct ingredients were collected
print(
    'There are {} ingredients'.format(len(ingredients))
)

There are 77 ingredients


<div class="alert alert-warning">
TODO: The number of distinct ingredients is fairly large relative to the number of recipes.
</div>

In [7]:
# order the ingredients alphabetically, keeping the same list object, then display them
ingredients[:] = sorted(ingredients)
ingredients

['alga nori',
 'ananas',
 'anguilla',
 'arachidi',
 'asparagi',
 'astice',
 'avocado',
 'branzino',
 'capesante',
 'caviale',
 'cetriolo',
 'crunch',
 'daikon',
 'erba cipollina',
 'fagiolini',
 'farina di tapioca',
 'fiori di zucca',
 'foglia di shiso',
 'foie gras',
 'formaggio',
 'fragola',
 'gamberi',
 'granchio',
 'ikura',
 'insalata',
 'maionese',
 'maionese piccante',
 'mandorle',
 'mango',
 'manzo',
 'menta',
 'nero di seppia',
 'olio',
 'olio tartufato',
 'pane giapponese',
 'pasta kataifi',
 'patate',
 'pesce bianco',
 'philadelphia',
 'pinoli',
 'pistacchio',
 'pollo',
 'pomodoro',
 'ricciola',
 'riso',
 'rucola',
 'salmone',
 'salsa al mango',
 'salsa dello chef',
 'salsa di miso',
 'salsa kabayaki',
 'salsa piccante',
 'salsa ponzu',
 'salsa teriyaki',
 'salsa yuzu',
 'scampi',
 'sesamo',
 'shichimi',
 'soya',
 'spaghetti',
 'spezie',
 'spigola',
 'surimi',
 'tabasco',
 'tempura',
 'tobiko',
 'tofu',
 'tonno',
 'uova',
 'uova di pesce volante',
 'uova di quaglia',
 'uova d

In [8]:
# build the ingredient -> ID lookup: the ID of an ingredient is its position in `ingredients`.
# the empty string (an unused ingredient slot) is mapped to itself
ingredients_dict = {'': ''}
ingredients_dict.update((name, position) for position, name in enumerate(ingredients))

In [9]:
# replace every ingredient name in the columns '1'..'15' with its ID from `ingredients_dict`
# (a name missing from the dictionary raises KeyError)
for i in range(1, 16):
    data[str(i)] = data[str(i)].apply(
        lambda name: ingredients_dict[name]
    )

In [10]:
# preview the first five rows, now holding ingredient IDs instead of names
data.head()

Unnamed: 0,RECIPE,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,AMAEBI FRY,21,35,53,26.0,,,,,,,,,,,
1,NORI EDAMAME,58,0,57,,,,,,,,,,,,
2,MISO SOUP,76,0,66,,,,,,,,,,,,
3,OSUMASHI,76,0,4,,,,,,,,,,,,
4,SHRIMP-SALMON,64,21,38,46.0,65.0,53.0,,,,,,,,,


## Data integrity controls

In [11]:
def ingredients_for(recipe_name):
    '''
    Retrieve the ingredients of a recipe as a sorted array of distinct values.

    Parameters:
        recipe_name: the name to look up in the RECIPE column of `data`

    Returns:
        a 1-D numpy array (object dtype) with the distinct, non-blank values found in the
        first row of `data` matching the name, in ascending order

    Raises:
        ValueError: if no row of `data` has the given recipe name
    '''
    # select the matching rows once and reuse the selection
    matches = data[data['RECIPE'] == recipe_name]
    # check that the recipe exists
    if len(matches) == 0:
        raise ValueError('The recipe "{}" does not exists'.format(recipe_name))
    # skip the first column (the recipe name) and keep only the ingredient slots
    slots = matches.values[0][1:]
    # drop the blank slots BEFORE sorting: a row mixes integer IDs with '' placeholders,
    # and Python 3 cannot order int against str (which is what np.setdiff1d would attempt)
    used = set(x for x in slots if x != '')
    # sorted, distinct values in an object array
    return np.array(sorted(used), dtype=object)

def uses_ingredients(r, l):
    '''
    Tell whether the recipe named `r` contains every ingredient listed in `l`
    (the recipe may also contain other ingredients).
    '''
    available = set(ingredients_for(r))
    return available.issuperset(l)

def compare_ingredient_lists(l1, l2):
    '''
    Check whether the two given lists of ingredients describe the same recipe,
    i.e. contain the same ingredients regardless of order and repetitions
    '''
    # the lists match when no ingredient belongs to only one of them
    return not set(l1).symmetric_difference(l2)

def compare_recipes(r1, r2):
    '''
    Tell whether the two named recipes use exactly the same ingredients
    (in which case they are duplicates of each other)
    '''
    return compare_ingredient_lists(ingredients_for(r1), ingredients_for(r2))

def check_duplicated_ingredients(r):
    '''
    For the given recipe, check whether any ingredient appears more than once.

    The raw row of `data` is inspected directly: `ingredients_for` returns distinct
    values only, so a check built on its result could never find a duplicate.

    Parameters:
        r: the name to look up in the RECIPE column of `data`

    Returns:
        True if at least one non-blank ingredient is repeated in the recipe's row,
        False otherwise

    Raises:
        ValueError: if no row of `data` has the given recipe name
    '''
    rows = data[data['RECIPE'] == r]
    # check that the recipe exists (same error as ingredients_for)
    if len(rows) == 0:
        raise ValueError('The recipe "{}" does not exists'.format(r))
    # skip the first column (the recipe name) and ignore the blank ingredient slots
    used = [x for x in rows.values[0][1:] if x != '']
    # a set collapses repeated values: a size mismatch means there is a duplicate
    return len(set(used)) != len(used)

def get_recipe(ingredient_list, subset_match=False):
    '''
    Look up recipes by their ingredients.

    With "subset_match" False (the default), return the name of the first recipe whose
    ingredients are exactly the given ones, or None when there is no such recipe.
    With "subset_match" True, return the list of the names of all the recipes that use
    the given ingredients (possibly among others); the list may be empty.
    '''
    # recipe names, in dataframe order
    names = data['RECIPE'].values
    if subset_match:
        # every recipe containing all the requested ingredients
        return [name for name in names if uses_ingredients(name, ingredient_list)]
    # exact match: stop at the first recipe with exactly the requested ingredients
    exact_matches = (
        name for name in names
        if compare_ingredient_lists(ingredients_for(name), ingredient_list)
    )
    return next(exact_matches, None)

In [12]:
# inside recipes, the same ingredient should never appear twice.
# also, we want to work only on recipes with at least three ingredients
for r in data['RECIPE']:
    assert not check_duplicated_ingredients(r), 'Recipe "{}" has duplicated ingredients'.format(r)
    assert len(ingredients_for(r)) >= 3, 'Recipe "{}" has fewer than three ingredients'.format(r)

# recipes with different names but the same set of ingredients are not allowed.
# NOTE: work on a COPY of the names. `.values` can expose the dataframe's own buffer, so the
# in-place sort below would otherwise reorder the RECIPE column without moving the
# ingredient columns (pairing every name with the ingredients of another recipe)
recipes = data['RECIPE'].values.copy()
for i in range(len(recipes)):
    r1 = recipes[i]
    for j in range(i+1, len(recipes)):
        r2 = recipes[j]
        assert not compare_recipes(r1, r2), 'Recipes "{}" and "{}" are actually duplicates'.format(r1, r2)

# every recipe should have a different name: once sorted, equal names would be adjacent
recipes.sort()
assert not any(a == b for a, b in zip(recipes[:-1], recipes[1:])), 'There are different recipes with the same name'


print('Dataset is now ready.')

Dataset is now ready.


## Getting ready

In [13]:
# turn the dataframe into a list of recipes, each one being the list of its ingredients;
# the recipe name is read from the first column of every row
menu = [ingredients_for(name).tolist() for name in data.iloc[:, 0]]