# Deep Learning

In [1]:
import pandas as pd
import numpy as np
import json
import string
import re

### Load dataset

In [2]:
ingredients_simplified = pd.read_csv('Ingredients101/Annotations/ingredients_simplified.txt', sep="\n", header=None)
ingredients_simplified.columns = ['ingredients']
ingredients_simplified['ingredients'] = ingredients_simplified['ingredients'].apply(lambda x: x.split(","))

In [3]:
ingredients = []
for key, row in ingredients_simplified.iterrows():
    for ingredient in row['ingredients']:        
        if not ingredient in ingredients:
            ingredients.append(ingredient)
len(ingredients)

227

In [4]:
ingredients = pd.DataFrame(ingredients, columns=['ingredient'])
ingredients['vegetarian'] = ""
ingredients

Unnamed: 0,ingredient,vegetarian
0,butter,
1,flour,
2,sugar,
3,brown sugar,
4,apple,
...,...,...
222,brewed espresso,
223,frangelico,
224,rum,
225,lady fingers,


### Load the vegetarian ingredients

In [5]:
with open('foodclassification/database.json') as f:
  data = json.load(f)

In [6]:
vegetarian_ingredients = data['vegetarian'] + data['vegan']
vegetarian_ingredients = [ingredient.lower().translate(str.maketrans('', '', string.punctuation)) for ingredient in set(vegetarian_ingredients)]
vegetarian_ingredients[:5]

['agar agar powder',
 'cocoa butter',
 'fruit puree',
 'baking powder',
 'rice milk']

### Filter the vegetarian ingredients based on the blacklist of Ingredients 101

In [7]:
with open('Ingredients101/ingredients_simplification/blacklist.txt') as f:
    blacklist = f.read().splitlines()
print(blacklist[:5])

['powdered', 'of', 'light', 'ground', 'sauce']


In [8]:
pattern = "(^|\s)" + "($|\s)|(^|\s)".join(blacklist) + "($|\s)"
vegetarian_ingredients_clean = list(filter(None, [re.sub(pattern, "", ingredient) for ingredient in vegetarian_ingredients]))

In [9]:
len(vegetarian_ingredients_clean)

815

In [10]:
ingredients.loc[ingredients['ingredient'].str.contains("|".join(vegetarian_ingredients_clean)), 'vegetarian'] = True
print(ingredients['vegetarian'].value_counts())
ingredients

        150
True     77
Name: vegetarian, dtype: int64


Unnamed: 0,ingredient,vegetarian
0,butter,
1,flour,True
2,sugar,True
3,brown sugar,True
4,apple,True
...,...,...
222,brewed espresso,
223,frangelico,
224,rum,
225,lady fingers,


In [11]:
non_vegetarian_ingredients = [
     'anchov',
     'bacon',
     'bass',
     'beef',
     'beef',
     'boar',
     'bream',
     'burger',
     'clams',
     'clam',
     'caviar',
     'chicken',
     'collar',
     'crab',
     'duck',
     'eel',
     'fat',
     'fish',
     'filet',
     'flank',
     'foie',
     'game',
     'goose',
     'gravy',
     'hare',
     'ham',
     'horse',
     'kipper',
     'kidney',
     'lamb',
     'lamb',
     'liver',
     'lobster',
     'meat',
     'morel',
     'mussels',
     'octopus',
     'oyster',
     'pigeon',
     'pork',
     'poultry',
     'quail',
     'rump',
     'rabbit',
     'salami',
     'salmon',
     'seafood',
     'shrimp',
     'sirloin',
     'silverside'
     'smokie',
     'snails',
     'squid',
     'steak',
     'stuffing',
     'tongue',
     'tuna',
     'turkey',
     'trout',
     'veal',
     'zander'
]

In [12]:
ingredients.loc[ingredients['ingredient'].str.contains("|".join(non_vegetarian_ingredients)), 'vegetarian'] = False

In [13]:
ingredients['vegetarian'].value_counts()

         126
True      77
False     24
Name: vegetarian, dtype: int64

In [14]:
ingredients[ingredients['vegetarian'] == ""]['ingredient'].values

array(['butter', 'nut', 'baby back ribs', 'worcestershire', 'gin',
       'bread', 'phyllo dough', 'honey', 'lemon', 'baklava', 'pepper',
       'baby arugula', 'asiago', 'shallot', 'capers', 'crostini', 'beets',
       'gorgonzola', 'red wine', 'herbs', 'milk', 'grain', 'cucumber',
       'oil', 'seeds', 'rolls', 'brie', 'cheddar', 'salsa', 'baguette',
       'plain greek yogurt', 'lettuce', 'croutons', 'almond', 'liqueur',
       'cocktail', 'red snapper', 'avocado', 'tortillas', 'curry',
       'broth', 'greek yogurt', 'cumin', 'whipping',
       "morningstar farms chick'n patties", 'tartar', 'berries', 'cake',
       'soda', 'icing', 'mayonnaise', 'spray', 'glaze',
       'colored sprinkles', 'edam', 'english muffin', 'coriander',
       'cardamom', 'beer', 'cod', 'vermouth', 'calamari', 'masa harina',
       'cherry', 'feta', 'olive', 'gyoza wrappers', 'ketchup', 'shiitake',
       'dogs', 'dog bun', 'relish', 'anaheim chile', 'adobo', 'chickpea',
       'italian', 'bran', 'white 

In [15]:
ingredients.loc[ingredients['vegetarian'] == "", 'vegetarian'] = True

In [16]:
ingredients['vegetarian'].value_counts()

True     203
False     24
Name: vegetarian, dtype: int64

In [17]:
# non_vegetarian_df = [[ingredient, False] for ingredient in non_vegetarian_ingredients]
# non_vegetarian_df = pd.DataFrame(columns = ingredients.columns, data = non_vegetarian_df)
# ingredients = pd.concat([ingredients, non_vegetarian_df])

In [18]:
ingredients

Unnamed: 0,ingredient,vegetarian
0,butter,True
1,flour,True
2,sugar,True
3,brown sugar,True
4,apple,True
...,...,...
222,brewed espresso,True
223,frangelico,True
224,rum,True
225,lady fingers,True


In [19]:
ingredients.to_csv('ingredients_labeled.csv', header=True, index=False)