In [3]:
import os
import re

import numpy as np
import pandas as pd
from PIL import Image
from PIL import ImageFilter
from sklearn.preprocessing import LabelEncoder

# Dataset locations, relative to the notebook's working directory.
IMAGE_PATH = 'Recipes5k/images/'
PATH_TO_RECIPES = 'Recipes5k/annotations/'

# One recipe per line, each a comma-separated ingredient list. The file is read
# line by line instead of via pd.read_csv(sep="\n"), which recent pandas
# releases reject with a ValueError. Blank lines are skipped, matching
# read_csv's default skip_blank_lines=True, so row numbers stay the same.
with open(os.path.join(PATH_TO_RECIPES, 'ingredients_simplified_Recipes5k.txt'), encoding='utf-8') as recipes_file:
    recipes = pd.DataFrame({
        'ingredients': [line for line in (raw.rstrip('\n') for raw in recipes_file) if line]
    })

In [29]:
# One comma-separated ingredient list per line, stored in column 0.
# Read line by line because recent pandas releases raise a ValueError for
# read_csv(sep="\n"); blank lines are skipped as read_csv did by default.
with open('Ingredients101/Annotations/ingredients_simplified.txt', encoding='utf-8') as annotations_file:
    kaas = pd.DataFrame({
        0: [line for line in (raw.rstrip('\n') for raw in annotations_file) if line]
    })

In [30]:
# Turn every comma-separated line into a list of ingredient names.
kaas['ingredients'] = [line.split(",") for line in kaas[0]]
kaas

Unnamed: 0,0,ingredients
0,"butter,flour,sugar,brown sugar,apple,cinnamon,nut","[butter, flour, sugar, brown sugar, apple, cin..."
1,"baby back ribs,apple,salt,mustard,brown sugar,...","[baby back ribs, apple, salt, mustard, brown s..."
2,"nut,cinnamon,bread,butter,phyllo dough,sugar,h...","[nut, cinnamon, bread, butter, phyllo dough, s..."
3,"beef,lemon,gin,salt,pepper,baby arugula,asiago","[beef, lemon, gin, salt, pepper, baby arugula,..."
4,"fat,steak,gin,shallot,parsley,capers,worcester...","[fat, steak, gin, shallot, parsley, capers, wo..."
...,...,...
96,"onion,garlic,beef,chili,salt,sugar,corn tortil...","[onion, garlic, beef, chili, salt, sugar, corn..."
97,"flour,egg,cold water,salt,konbu dashi,dashi,so...","[flour, egg, cold water, salt, konbu dashi, da..."
98,"egg,sugar,mascarpone,brewed espresso,frangelic...","[egg, sugar, mascarpone, brewed espresso, fran..."
99,"sushi grade tuna,scallions,tomato,cilantro,gin...","[sushi grade tuna, scallions, tomato, cilantro..."


In [31]:
# Distinct ingredient names in order of first appearance. dict.fromkeys keeps
# insertion order and de-duplicates in linear time, replacing a list-membership
# test that rescanned the result list for every ingredient.
ingredients_two = list(dict.fromkeys(
    ingredient
    for recipe_ingredients in kaas['ingredients']
    for ingredient in recipe_ingredients
))

In [32]:
# Number of distinct ingredient names collected in ingredients_two.
len(ingredients_two)

227

In [2]:
def _read_lines(path):
    """Return the non-empty lines of the text file at ``path`` without line endings."""
    with open(path, encoding='utf-8') as handle:
        stripped = (raw.rstrip('\n') for raw in handle)
        return [line for line in stripped if line]


def getSplit(split, annotations_dir=None, recipes_frame=None):
    """Build the recipe table for one data split.

    Reads ``<split>_images.txt`` (one image url per line) and
    ``<split>_labels.txt`` (one integer row position per line) from the
    annotations directory, looks each position up in the recipe table and
    attaches the image url found on the same line number.

    Parameters
    ----------
    split : str
        Split name used as the file prefix, e.g. 'train', 'test' or 'val'.
    annotations_dir : str, optional
        Directory holding the split files; defaults to PATH_TO_RECIPES.
    recipes_frame : pandas.DataFrame, optional
        Recipe table indexed by row position; defaults to the global ``recipes``.

    Returns
    -------
    pandas.DataFrame
        One row per image with a 'position' column, the columns of the recipe
        table, and a 'url' column.
    """
    if annotations_dir is None:
        annotations_dir = PATH_TO_RECIPES
    if recipes_frame is None:
        recipes_frame = recipes

    # The files are read line by line because recent pandas releases raise a
    # ValueError for read_csv(sep="\n").
    image_urls = pd.DataFrame({
        'url': _read_lines(os.path.join(annotations_dir, '{}_images.txt'.format(split)))
    })
    image_indices = pd.DataFrame({
        'position': [
            int(line)
            for line in _read_lines(os.path.join(annotations_dir, '{}_labels.txt'.format(split)))
        ]
    })

    # Left-join on the recipe position, then pair with the urls by line number.
    recipes_labeled = image_indices.set_index('position').join(recipes_frame).reset_index().join(image_urls)
    return recipes_labeled

# Load the three splits in the order train, test, val.
data_train, data_test, data_val = (
    getSplit(split_name) for split_name in ('train', 'test', 'val')
)

## Cleaning the ingredients by using the ingredients dataframe

In [3]:
# Whitelist of ingredient names; only the 'ingredient' column is needed here.
clean_ingredients = pd.read_csv('ingredients_labeled.csv', usecols=['ingredient'])['ingredient'].values

def filter_ingredients(ingredients, allowed=None):
    """Keep only the whitelisted names of a comma-separated ingredient string.

    Parameters
    ----------
    ingredients : str
        Comma-separated ingredient names.
    allowed : container of str, optional
        Names to keep; defaults to the global ``clean_ingredients``. Passing a
        set makes each membership test constant-time.

    Returns
    -------
    str
        The kept names in their original order, joined by commas (an empty
        string when nothing is kept).
    """
    if allowed is None:
        allowed = clean_ingredients
    kept = [name for name in ingredients.split(",") if name in allowed]
    return ",".join(kept)

# Add a whitelisted copy of the ingredient list to every split.
for split_frame in (data_train, data_test, data_val):
    split_frame['ingredients_cleaned'] = split_frame['ingredients'].apply(filter_ingredients).dropna()
data_train.head()

Unnamed: 0,position,ingredients,url,ingredients_cleaned
0,69,"flour,salt,oil,cold water,apple,sugar,cinnamon...",apple_pie/20_homemade_apple_pie_hostedLargeUrl...,"flour,salt,oil,apple,sugar,cinnamon,butter"
1,91,"shell,pie,sugar,flour,cinnamon,apple,lemon,but...",apple_pie/43_homestyle_apple_pie_hostedLargeUr...,"pie,sugar,flour,cinnamon,apple,lemon,butter,milk"
2,77,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa...",apple_pie/28_apple_pie_hostedLargeUrl.jpg,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa..."
3,88,"pie,apple,sugar,corn starch,cinnamon,lemon,but...",apple_pie/39_classic_apple_pie_hostedLargeUrl.jpg,"pie,apple,sugar,cinnamon,lemon,butter,egg"
4,62,"apple,brown sugar,butter,cinnamon,allspice,nut...",apple_pie/13_mug_apple_pie_hostedLargeUrl.jpg,"apple,brown sugar,butter,cinnamon,allspice,nut..."


In [4]:
# The dish category is the folder part of the image url ("apple_pie/20_...jpg").
# Each split derives it from its OWN urls; previously the test and validation
# splits were filled from data_train['url'], which gave them the categories of
# unrelated training rows.
data_train['category'] = data_train['url'].apply(lambda x: x.split("/")[0])
data_test['category'] = data_test['url'].apply(lambda x: x.split("/")[0])
data_val['category'] = data_val['url'].apply(lambda x: x.split("/")[0])

## Labeling the data splits

In [5]:
ingredients_labeled = pd.read_csv('ingredients_labeled.csv')
# NOTE: despite its name, this frame holds the rows whose 'vegetarian' flag is
# False, i.e. the ingredients that make a recipe non-vegetarian.
vegetarian_ingredients = ingredients_labeled.loc[ingredients_labeled['vegetarian'].eq(False)]
# Regex alternation of those ingredient names.
pattern = '|'.join(vegetarian_ingredients['ingredient'])

In [6]:
def label_data(data, vegetarian_ingredients):
    """Add a 'label' column marking each recipe as vegetarian or not.

    A recipe is labelled 'Vegetarian-Not' when its 'ingredients' string
    contains any of the given ingredient names, otherwise 'Vegetarian'.
    Matching is by substring, so a name also matches inside a longer
    ingredient name.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an 'ingredients' column of comma-separated names. It is
        modified in place.
    vegetarian_ingredients : pandas.DataFrame
        Frame whose 'ingredient' column lists the names that disqualify a
        recipe from being vegetarian.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the 'label' column added.
    """
    # Escape every name so regex metacharacters in it are matched literally, and
    # drop missing/empty names (an empty alternative would match every recipe).
    names = [
        re.escape(str(name))
        for name in vegetarian_ingredients['ingredient'].dropna()
        if str(name)
    ]
    if names:
        is_non_vegetarian = data['ingredients'].str.contains('|'.join(names))
    else:
        # Nothing to look for: no recipe can be flagged.
        is_non_vegetarian = np.zeros(len(data), dtype=bool)

    data['label'] = np.where(is_non_vegetarian, 'Vegetarian-Not', 'Vegetarian')
    print(data['label'].value_counts())
    return data
    
# label_data adds the column in place and returns the same frame, so each
# *_labeled name refers to the same object as the corresponding data_* frame.
data_train_labeled, data_test_labeled, data_val_labeled = (
    label_data(split_frame, vegetarian_ingredients)
    for split_frame in (data_train, data_test, data_val)
)
data_train_labeled.head()

Vegetarian        1943
Vegetarian-Not    1466
Name: label, dtype: int64
Vegetarian        453
Vegetarian-Not    330
Name: label, dtype: int64
Vegetarian        373
Vegetarian-Not    261
Name: label, dtype: int64


Unnamed: 0,position,ingredients,url,ingredients_cleaned,category,label
0,69,"flour,salt,oil,cold water,apple,sugar,cinnamon...",apple_pie/20_homemade_apple_pie_hostedLargeUrl...,"flour,salt,oil,apple,sugar,cinnamon,butter",apple_pie,Vegetarian
1,91,"shell,pie,sugar,flour,cinnamon,apple,lemon,but...",apple_pie/43_homestyle_apple_pie_hostedLargeUr...,"pie,sugar,flour,cinnamon,apple,lemon,butter,milk",apple_pie,Vegetarian
2,77,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa...",apple_pie/28_apple_pie_hostedLargeUrl.jpg,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa...",apple_pie,Vegetarian
3,88,"pie,apple,sugar,corn starch,cinnamon,lemon,but...",apple_pie/39_classic_apple_pie_hostedLargeUrl.jpg,"pie,apple,sugar,cinnamon,lemon,butter,egg",apple_pie,Vegetarian
4,62,"apple,brown sugar,butter,cinnamon,allspice,nut...",apple_pie/13_mug_apple_pie_hostedLargeUrl.jpg,"apple,brown sugar,butter,cinnamon,allspice,nut...",apple_pie,Vegetarian


In [7]:
# Integer-encode the label column. LabelEncoder numbers classes in sorted order,
# so 'Vegetarian' becomes 0 and 'Vegetarian-Not' becomes 1.
label_encoder = LabelEncoder().fit(data_train['label'])
data_train['vegetarian'] = label_encoder.transform(data_train['label'])

In [8]:
# Mean of the encoded label per dish category.
means = data_train.groupby('category')['vegetarian'].mean()
# Categories whose mean is strictly below 0.5 are treated as vegetarian.
vegetarian_categories = means.loc[means.lt(0.5)]
# Overwrite the per-recipe label with the label of the recipe's category.
data_train['label'] = np.where(
    data_train['category'].isin(vegetarian_categories.index),
    'Vegetarian',
    'Vegetarian-Not',
)

In [9]:
wd = os.getcwd() + '/'
def balance_dataset(dataset, random_state=None):
    """Oversample 'Vegetarian-Not' rows in place until both labels are equally frequent.

    For each randomly chosen 'Vegetarian-Not' row, a Gaussian-blurred copy of
    its image is saved under IMAGE_PATH with a ``bl_`` prefix on the file name,
    and a duplicate row pointing at that copy is appended to ``dataset``.
    Nothing happens when 'Vegetarian-Not' is already at least as frequent as
    'Vegetarian'. If there are fewer 'Vegetarian-Not' rows than the gap, each is
    duplicated once and the dataset stays partially unbalanced.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'label' and 'url' columns. Modified in place; new rows are
        written at index label ``len(dataset)``, so a default 0..n-1 index is
        assumed.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` to make the choice of augmented
        rows reproducible. The default (None) keeps the selection random.

    Returns
    -------
    None
    """
    # Look the counts up by label rather than by position, and compute the gap
    # once instead of recounting the whole frame on every iteration.
    label_counts = dataset['label'].value_counts()
    deficit = int(label_counts.get('Vegetarian', 0)) - int(label_counts.get('Vegetarian-Not', 0))
    if deficit <= 0:
        return

    shuffled = dataset[dataset['label'] == 'Vegetarian-Not'].sample(frac=1, random_state=random_state)
    for _, recipe in shuffled.head(deficit).iterrows():
        url = recipe['url']

        # "category/file.jpg" -> "category/bl_file.jpg"
        parts = url.split('/')
        parts[1] = 'bl_' + parts[1]
        new_url = "/".join(parts)

        # The context manager closes the source image file once the copy is saved.
        with Image.open(IMAGE_PATH + url) as image:
            image_blur = image.filter(ImageFilter.GaussianBlur)
            image_blur.save(wd + IMAGE_PATH + new_url)

        recipe['url'] = new_url
        dataset.loc[len(dataset)] = recipe.tolist()

In [10]:
# Only the training split is balanced; the calls for the test and validation
# splits are left disabled.
balance_dataset(data_train)
# balance_dataset(data_test)
# balance_dataset(data_val)
# Show the label counts after balancing.
print(data_train['label'].value_counts())

Vegetarian-Not    1907
Vegetarian        1907
Name: label, dtype: int64


## Exporting the labeled data

In [11]:
def export_data(data, split):
    """Write ``data`` to ``data/<split>_labeled.csv`` with a header and no index column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to export.
    split : str
        Split name used as the file-name prefix, e.g. 'train'.
    """
    # Create the output folder on first use so a fresh checkout can run the export.
    os.makedirs('data', exist_ok=True)
    data.to_csv('data/{}_labeled.csv'.format(split), header=True, index=False)

In [12]:
# Export every split under its own file name.
for split_name, split_frame in (
    ('train', data_train_labeled),
    ('test', data_test_labeled),
    ('val', data_val_labeled),
):
    export_data(split_frame, split_name)

In [13]:
# Re-encode the current labels; LabelEncoder numbers classes in sorted order,
# so 'Vegetarian' becomes 0 and 'Vegetarian-Not' becomes 1.
label_encoder = LabelEncoder()
data_train['vegetarian'] = label_encoder.fit_transform(data_train['label'])

# Share of 'Vegetarian-Not' rows per category. groupby(...).mean() returns a
# Series, which has no columns, so the previous `means.columns = [...]`
# assignment renamed nothing; rename() sets the name used as the CSV header.
means = data_train.groupby('category')['vegetarian'].mean().rename('not_vegetarian')
means.to_csv('data/category_not_vegetarian.csv', header=True)