# Deep Learning

In [1]:
import pandas as pd
import numpy as np
import json
import string
import re


### Load dataset

In [2]:
ingredients = pd.read_csv('Ingredients101/ingredients_simplification/baseIngredients.txt', sep=",", header=None, squeeze=True).T
ingredients.columns = ['ingredient']
ingredients['vegetarian'] = ""
ingredients.shape

(1095, 2)

### Load the vegetarian ingredients

In [3]:
with open('foodclassification/database.json') as f:
  data = json.load(f)

In [4]:
vegetarian_ingredients = data['vegetarian'] + data['vegan']
vegetarian_ingredients = [ingredient.lower().translate(str.maketrans('', '', string.punctuation)) for ingredient in set(vegetarian_ingredients)]
vegetarian_ingredients[:5]

['natural and artificial flavoring',
 'coconut milk',
 'potato starch  contains  milk',
 'shitake mushroom extract',
 'organic high oleic safflower andor sunflower oil']

### Filter the vegetarian ingredients based on the blacklist of Ingredients 101

In [5]:
with open('Ingredients101/ingredients_simplification/blacklist.txt') as f:
    blacklist = f.read().splitlines()
print(blacklist[:5])

['powdered', 'of', 'light', 'ground', 'sauce']


In [6]:
pattern = "(^|\s)" + "($|\s)|(^|\s)".join(blacklist) + "($|\s)"
vegetarian_ingredients_clean = list(filter(None, [re.sub(pattern, "", ingredient) for ingredient in vegetarian_ingredients]))

In [7]:
len(vegetarian_ingredients_clean)

815

In [8]:
ingredients.loc[ingredients['ingredient'].str.contains("|".join(vegetarian_ingredients_clean)), 'vegetarian'] = True
print(ingredients['vegetarian'].value_counts())
ingredients

        774
True    321
Name: vegetarian, dtype: int64


Unnamed: 0,ingredient,vegetarian
0,acidulated water,True
1,ackee,
2,acorn squash,True
3,aduki beans,True
4,advocaat,
...,...,...
1090,yeast,True
1091,yellow lentil,
1092,yoghurt,
1093,zander,


In [9]:
non_vegetarian_ingredients = [
     'anchov',
     'bacon',
     'bass',
     'beef',
     'beef',
     'boar',
     'bream',
     'burger',
     'clams',
     'caviar',
     'chicken',
     'collar',
     'crab',
     'duck',
     'eel',
     'fat',
     'fish',
     'filet',
     'flank',
     'foie',
     'game',
     'goose',
     'gravy',
     'hare',
     'ham',
     'horse',
     'kipper',
     'kidney',
     'lamb',
     'lamb',
     'liver',
     'lobster',
     'meat',
     'morel',
     'mussels',
     'octopus',
     'oyster',
     'pigeon',
     'pork',
     'poultry',
     'quail',
     'rump',
     'rabbit',
     'salami',
     'salmon',
     'seafood',
     'shrimp',
     'sirloin',
     'silverside'
     'smokie',
     'squid',
     'steak',
     'stuffing',
     'tongue',
     'tuna',
     'turkey',
     'trout',
     'veal',
     'zander'
]

In [10]:
ingredients.loc[ingredients['ingredient'].str.contains("|".join(non_vegetarian_ingredients)), 'vegetarian'] = False

In [11]:
ingredients['vegetarian'].value_counts()

         630
True     310
False    155
Name: vegetarian, dtype: int64

In [12]:
ingredients[ingredients['vegetarian'] == ""]['ingredient'].values

array(['ackee', 'advocaat', 'agar-agar', 'ale', 'alfalfa sprouts',
       'almond', 'almond essence', 'almond extract', 'amaranth',
       'amaretti', 'angelica', 'angostura bitters', 'anise', 'apricot',
       'apricot jam', 'arbroath smokie', 'argan oil', 'arrowroot',
       'artichoke', 'asafoetida', 'asparagus', 'aubergine', 'avocado',
       'bagel', 'baguette', 'bamboo shoots', 'banana', 'banana bread',
       'barbecue sauce', 'beer', 'beetroot', 'berry', 'betel leaves',
       'beurre manie', 'bilberries', "bird's-eye chillies", 'biscotti',
       'biscuits', 'blachan', 'black pudding', 'black treacle',
       'blackbean sauce', 'blackberry', 'blackcurrant',
       'blackcurrant juice drink', 'blini', 'blood orange', 'bok choi',
       'bonito', 'borage', 'bouquet garni', 'bran', 'brandy',
       'brandy butter', 'brandy snaps', 'bratwurst', 'brazil nut',
       'bread', 'bread roll', 'bread sauce', 'breadcrumbs', 'breadsticks',
       'bresaola', 'brie', 'brill', 'brioche', 'b

In [13]:
ingredients.loc[ingredients['vegetarian'] == "", 'vegetarian'] = True

In [14]:
ingredients['vegetarian'].value_counts()

True     940
False    155
Name: vegetarian, dtype: int64

In [15]:
non_vegetarian_df = [[ingredient, False] for ingredient in non_vegetarian_ingredients]
non_vegetarian_df = pd.DataFrame(columns = ingredients.columns, data = non_vegetarian_df)
ingredients = pd.concat([ingredients, non_vegetarian_df])

In [16]:
ingredients

Unnamed: 0,ingredient,vegetarian
0,acidulated water,True
1,ackee,True
2,acorn squash,True
3,aduki beans,True
4,advocaat,True
...,...,...
52,tuna,False
53,turkey,False
54,trout,False
55,veal,False


In [17]:
ingredients.to_csv('ingredients_labeled.csv', header=True, index=False)