In [1]:
import os
import re

import numpy as np
import pandas as pd

# Directory containing the Recipes5k annotation text files read below.
# Keep the trailing slash: file paths are built from this value with plain
# string formatting, not with a path-joining helper.
PATH_TO_RECIPES = 'Recipes5k/annotations/'

In [2]:
# Master ingredient table: one row per non-blank line of the ingredients file,
# so the row index is the line position that the split label files refer to.
# The lines are read directly instead of through pd.read_csv(sep="\n"), which
# current pandas rejects with a ValueError (a line terminator is not accepted
# as a separator). Blank lines are skipped, as read_csv did by default.
with open('{}ingredients_simplified_Recipes5k.txt'.format(PATH_TO_RECIPES), encoding='utf-8') as ingredients_file:
    recipes = pd.DataFrame({
        'ingredients': [line.rstrip('\r\n') for line in ingredients_file if line.strip()]
    })
def _read_lines(path):
    """Return the non-blank lines of the UTF-8 text file at ``path``, without line endings."""
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip('\r\n') for line in handle if line.strip()]


def getSplit(split):
    """Build the recipe table for one dataset split.

    Reads ``<split>_images.txt`` and ``<split>_labels.txt`` from
    ``PATH_TO_RECIPES``. Each label is used as a row position into the
    module-level ``recipes`` frame, and the i-th image is paired with the
    i-th label.

    The files are read line by line rather than with
    ``pd.read_csv(sep="\\n")``, which current pandas rejects with a
    ValueError.

    Parameters
    ----------
    split : str
        Prefix of the split files, e.g. 'train', 'test' or 'val'.

    Returns
    -------
    pd.DataFrame
        Columns ``position``, the columns of ``recipes``, and ``url``;
        one row per line of the split files.

    Raises
    ------
    ValueError
        If the image and label files have different numbers of entries
        (the positional pairing would otherwise silently produce NaN rows),
        or if a label is not an integer.
    """
    image_lines = _read_lines('{}{}_images.txt'.format(PATH_TO_RECIPES, split))
    label_lines = _read_lines('{}{}_labels.txt'.format(PATH_TO_RECIPES, split))
    if len(image_lines) != len(label_lines):
        raise ValueError(
            "split '{}': {} image entries but {} label entries".format(
                split, len(image_lines), len(label_lines)
            )
        )

    image_urls = pd.DataFrame({'url': image_lines})
    # Labels must be integers so they line up with the integer index of `recipes`.
    image_indices = pd.DataFrame({'position': label_lines}).astype({'position': 'int64'})

    # First join: look up each recipe by position. Second join: attach the
    # image url by row order (both frames carry a fresh 0..n-1 index).
    recipes_labeled = image_indices.set_index('position').join(recipes).reset_index().join(image_urls)
    return recipes_labeled

# Load the three dataset splits, in the order train, test, val.
data_train, data_test, data_val = (
    getSplit(split_name) for split_name in ('train', 'test', 'val')
)

## Labeling the data splits

In [3]:
# Ingredient list with a per-ingredient `vegetarian` flag.
ingredients_labeled = pd.read_csv('ingredients_labeled.csv')

# NOTE: despite its name, `vegetarian_ingredients` keeps only the rows whose
# `vegetarian` flag is False, i.e. the ingredients used to mark a recipe as
# Non-Vegetarian. The name is kept because later cells refer to it.
is_flagged_false = ingredients_labeled['vegetarian'].eq(False)
vegetarian_ingredients = ingredients_labeled[is_flagged_false]

In [4]:
def label_data(data, vegetarian_ingredients):
    """Tag every recipe as 'Vegetarian' or 'Non-Vegetarian' and print the label counts.

    A recipe is 'Non-Vegetarian' when its ``ingredients`` string contains at
    least one entry of ``vegetarian_ingredients['ingredient']`` as a literal
    substring; otherwise it is 'Vegetarian'.

    NOTE(review): matching is by substring, so a short entry also matches
    inside longer ingredient names (e.g. 'ham' inside 'graham'). Confirm
    whether whole-ingredient matching is wanted.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a text column ``ingredients``. It is not modified.
    vegetarian_ingredients : pd.DataFrame
        Must contain an ``ingredient`` column. Despite the parameter name,
        every listed ingredient marks a recipe as Non-Vegetarian.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` with a ``label`` column added (or replaced).
    """
    # Escape each name so characters such as '(', '+' or '.' are matched
    # literally instead of being interpreted as regex syntax. Missing and
    # empty names are dropped: an empty alternative would match every recipe.
    terms = [
        re.escape(term)
        for term in vegetarian_ingredients['ingredient'].dropna().astype(str)
        if term
    ]

    if terms:
        # na=False: a recipe with a missing ingredient string contains none of
        # the terms. Without it the NaN result is truthy inside np.where.
        is_non_vegetarian = data['ingredients'].str.contains('|'.join(terms), regex=True, na=False)
    else:
        # No terms at all: nothing can be flagged (the pattern '' would match everything).
        is_non_vegetarian = pd.Series(False, index=data.index)

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    # and re-running the cell is idempotent.
    labeled = data.assign(label=np.where(is_non_vegetarian, 'Non-Vegetarian', 'Vegetarian'))
    print(labeled['label'].value_counts())
    return labeled
    
# Label the train, test and val splits (in that order) against the same
# ingredient list, then preview the labelled training split.
data_train_labeled, data_test_labeled, data_val_labeled = [
    label_data(split_frame, vegetarian_ingredients)
    for split_frame in (data_train, data_test, data_val)
]
data_train_labeled.head()

Vegetarian        2153
Non-Vegetarian    1256
Name: label, dtype: int64
Vegetarian        492
Non-Vegetarian    291
Name: label, dtype: int64
Vegetarian        401
Non-Vegetarian    233
Name: label, dtype: int64


Unnamed: 0,position,ingredients,url,label
0,69,"flour,salt,oil,cold water,apple,sugar,cinnamon...",apple_pie/20_homemade_apple_pie_hostedLargeUrl...,Vegetarian
1,91,"shell,pie,sugar,flour,cinnamon,apple,lemon,but...",apple_pie/43_homestyle_apple_pie_hostedLargeUr...,Vegetarian
2,77,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa...",apple_pie/28_apple_pie_hostedLargeUrl.jpg,Vegetarian
3,88,"pie,apple,sugar,corn starch,cinnamon,lemon,but...",apple_pie/39_classic_apple_pie_hostedLargeUrl.jpg,Vegetarian
4,62,"apple,brown sugar,butter,cinnamon,allspice,nut...",apple_pie/13_mug_apple_pie_hostedLargeUrl.jpg,Vegetarian


## Exporting the labeled data

In [5]:
def export_data(data, split):
    """Write ``data`` to ``data/<split>_labeled.csv`` with a header row and no index column.

    The ``data`` output directory is created when it does not exist yet, so
    the export also works on a fresh checkout.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to export.
    split : str
        Split name used as the file-name prefix, e.g. 'train'.
    """
    out_path = 'data/{}_labeled.csv'.format(split)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    data.to_csv(out_path, header=True, index=False)

In [6]:
# Export every labelled split under its own split name (train, test, val).
for split_name, split_frame in (
    ('train', data_train_labeled),
    ('test', data_test_labeled),
    ('val', data_val_labeled),
):
    export_data(split_frame, split_name)