In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

In [109]:
dish_ds = pd.read_csv('3A2M_EXTENDED.csv')

In [110]:
dish_ds.head()

Unnamed: 0,title,NER,Extended_NER,genre,label,directions
0,\t Arugula Pomegranate Salad,"[""baby spinach"", ""baby arugula"", ""pomegranate ...","['alfalfa sprouts', 'baby spinach', 'baby arug...",vegetables,4,"[""Toss together spinach and arugula, then plac..."
1,\t Black Bean And Turkey Chili,"[""olive oil"", ""yellow onion"", ""garlic"", ""groun...","['one', 'yellow onion', 'tomato paste', 'about...",sides,8,"[""Dice the onion and mince the garlic. Add the..."
2,\t Finger Lickin' Tofu Nuggets,"[""extra firm"", ""almond flour"", ""nutritional ye...","['extra firm', '2', 'coconut oil', 'almond flo...",nonveg,3,"[""Wrap the tofu in a clean tea towel and press..."
3,\t Jerk Beef Stew With Carrots And Tomatoes,"[""olive oil"", ""boneless beef chuck"", ""onion"", ...","['boneless beef chuck', '2', 'Saute', 'onion',...",vegetables,4,"[""Preheat oven to 350 degrees F."", ""Heat the o..."
4,\t Pomegranate Couscous Salad,"[""pomegranate arils"", ""whole wheat couscous"", ...","['whole wheat couscous', '10 minutes', 'lemon ...",vegetables,4,"[""Place couscous in a bowl with 11/2 cups of h..."


In [111]:
def preprocess_ingredients(row):
    ingredients = ast.literal_eval(row['Extended_NER'])
    second_ingredients = ast.literal_eval(row['NER'])
    return str(ingredients + second_ingredients)

In [112]:
dish_ds['ingredients'] = dish_ds.apply(preprocess_ingredients, axis=1)

In [113]:
dish_ds_ner_combined = dish_ds.drop(['NER', 'Extended_NER'], axis=1)

In [114]:
dish_ds_ner_combined.head()

Unnamed: 0,title,genre,label,directions,ingredients
0,\t Arugula Pomegranate Salad,vegetables,4,"[""Toss together spinach and arugula, then plac...","['alfalfa sprouts', 'baby spinach', 'baby arug..."
1,\t Black Bean And Turkey Chili,sides,8,"[""Dice the onion and mince the garlic. Add the...","['one', 'yellow onion', 'tomato paste', 'about..."
2,\t Finger Lickin' Tofu Nuggets,nonveg,3,"[""Wrap the tofu in a clean tea towel and press...","['extra firm', '2', 'coconut oil', 'almond flo..."
3,\t Jerk Beef Stew With Carrots And Tomatoes,vegetables,4,"[""Preheat oven to 350 degrees F."", ""Heat the o...","['boneless beef chuck', '2', 'Saute', 'onion',..."
4,\t Pomegranate Couscous Salad,vegetables,4,"[""Place couscous in a bowl with 11/2 cups of h...","['whole wheat couscous', '10 minutes', 'lemon ..."


In [115]:
available_ingredients = [
    "22",
    "pig",
    "garlic",
    "calamansi",
    "tomato",
    "bell pepper",
    "carrot",
    "ginger",
    "chicken",
    "potato",
    "cucumber",
    "onion",
    "chili",
    "long chili",
]

def filter_ingredients(row):
    ingredients = ast.literal_eval(row['ingredients'])
    ingredients = set([ingredient.lower().strip() for ingredient in ingredients])
    return not ingredients.isdisjoint(available_ingredients)

In [116]:
mask = dish_ds_ner_combined.apply(filter_ingredients, axis=1)

In [117]:
mask.value_counts()

False    1456961
True      774182
Name: count, dtype: int64

In [118]:
dish_ds_filtered = dish_ds_ner_combined[mask]

In [119]:
dish_ds_filtered.head()

Unnamed: 0,title,genre,label,directions,ingredients
1,\t Black Bean And Turkey Chili,sides,8,"[""Dice the onion and mince the garlic. Add the...","['one', 'yellow onion', 'tomato paste', 'about..."
2,\t Finger Lickin' Tofu Nuggets,nonveg,3,"[""Wrap the tofu in a clean tea towel and press...","['extra firm', '2', 'coconut oil', 'almond flo..."
3,\t Jerk Beef Stew With Carrots And Tomatoes,vegetables,4,"[""Preheat oven to 350 degrees F."", ""Heat the o...","['boneless beef chuck', '2', 'Saute', 'onion',..."
15,Sparkrecipes Is Powered By Sparkpeople.Com T...,cereal,6,"[""Portion the turkey as soon as you get home f...","['Place', 'egg whites', 'onion', 'chedder', 'g..."
16,(Cheesy Mashed Potatoes) Recipe,vegetables,4,"[""In a large saucepan, cover potatoes and 1 wh...","['Bring', 'mixed Alpine cheeses', 'about 20 mi..."


In [120]:
def transform_list_to_string(row):
    return ';'.join(ast.literal_eval(row['ingredients']))

In [121]:
dish_ds_filtered['ingredients'] = dish_ds_filtered.apply(transform_list_to_string, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dish_ds_filtered['ingredients'] = dish_ds_filtered.apply(transform_list_to_string, axis=1)


In [122]:
dish_ds_filtered.head()

Unnamed: 0,title,genre,label,directions,ingredients
1,\t Black Bean And Turkey Chili,sides,8,"[""Dice the onion and mince the garlic. Add the...",one;yellow onion;tomato paste;about 10 minutes...
2,\t Finger Lickin' Tofu Nuggets,nonveg,3,"[""Wrap the tofu in a clean tea towel and press...",extra firm;2;coconut oil;almond flour;nutritio...
3,\t Jerk Beef Stew With Carrots And Tomatoes,vegetables,4,"[""Preheat oven to 350 degrees F."", ""Heat the o...",boneless beef chuck;2;Saute;onion;1 hour to 1 ...
15,Sparkrecipes Is Powered By Sparkpeople.Com T...,cereal,6,"[""Portion the turkey as soon as you get home f...",Place;egg whites;onion;chedder;ground turkey;B...
16,(Cheesy Mashed Potatoes) Recipe,vegetables,4,"[""In a large saucepan, cover potatoes and 1 wh...",Bring;mixed Alpine cheeses;about 20 minutes;th...


In [123]:
genres = dish_ds_filtered['genre'].unique()
def sample_genre(genre_df):
    return genre_df.sample(n=2000, random_state=9)
genres

array(['sides', 'nonveg', 'vegetables', 'cereal', 'drinks', 'fusion',
       'fastfood', 'bakery', 'meal', 'Fusion'], dtype=object)

In [124]:
# Take 2000 of each genre and put it into a new df
dish_ds_filtered_sampled = dish_ds_filtered.groupby('genre').apply(sample_genre).reset_index(drop=True)
dish_ds_filtered_sampled.head()

  dish_ds_filtered_sampled = dish_ds_filtered.groupby('genre').apply(sample_genre).reset_index(drop=True)


Unnamed: 0,title,genre,label,directions,ingredients
0,Stuffed Shells Primavera(Serves 6),Fusion,9,Simmer tomatoes oil basil garlic salt and pepp...,zucchini;Ricotta;butter;Season;Arrange;Parmesa...
1,Hodgepodge,Fusion,9,Combine all ingredients in large pot. Cook ove...,15 to 20 minutes;Add;onion;pork;14;minestrone ...
2,Fricadellen,Fusion,9,"[""Combine beef, onion, bread, 1/2 cup of water...",ground nutmeg;1/4 cup;Combine;mushroom gravy;w...
3,Oyster Stuffing,Fusion,9,"[""Heat oysters in"", ""own"", ""liquid,"", ""drain.""...",butter;1/2 cup;water;sat;oysters;pepper;bread ...
4,Individual Scout Pack,Fusion,9,On a long piece of heavy aluminum foil place o...,onion;Worcestershire sauce;one half;green pepp...


In [125]:
dish_ds_filtered_sampled['genre'].value_counts()

genre
Fusion        2000
bakery        2000
cereal        2000
drinks        2000
fastfood      2000
fusion        2000
meal          2000
nonveg        2000
sides         2000
vegetables    2000
Name: count, dtype: int64

In [127]:
dish_ds_filtered_sampled.to_csv('3A2M_EXTENDED_FILTERED_SAMPLED.csv', index=False)
dish_ds_filtered_sampled.to_json('3A2M_EXTENDED_FILTERED_SAMPLED.json', orient='records')