In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
import re
from library.sb_utils import save_file

#ignore warning messages to ensure clean outputs
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the two raw CSV exports that the rest of the notebook works from.
# Paths are relative to the notebook's working directory.
recipes_df = pd.read_csv(filepath_or_buffer='../data/RAW_recipes.csv')
users_df = pd.read_csv(filepath_or_buffer='../data/RAW_interactions.csv')

In [4]:
# Peek at the first five recipe rows to see what each column looks like.
recipes_df.head(n=5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [5]:
# Number of missing values in each recipe column.
recipes_df.isna().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [6]:
# Column dtypes as inferred by read_csv. Columns reported as `object` hold
# Python strings, so list-looking columns such as `nutrition` and `tags`
# have to be parsed before they can be used as lists/numbers.
recipes_df.dtypes

name              object
id                 int64
minutes            int64
contributor_id     int64
submitted         object
tags              object
nutrition         object
n_steps            int64
steps             object
description       object
ingredients       object
n_ingredients      int64
dtype: object

In [47]:
# Convert the stringified nutrition list (e.g. "[51.5, 0.0, 13.0, ...]") into
# one float column per value, keyed by the recipe `id`.
NUTRITION_COLS = [f'nutrition_{i}' for i in range(1, 8)]

nutrition_values = (
    recipes_df['nutrition']
    # Strip the enclosing brackets as literal characters; this avoids passing
    # the regex metacharacters '[' / ']' to str.replace.
    .str.strip('[]')
    .str.split(',', expand=True)
    # errors='coerce' turns any value that cannot be parsed into NaN instead
    # of raising, matching the original conversion.
    .apply(pd.to_numeric, errors='coerce')
)
# Raises if a row did not split into exactly seven values.
nutrition_values.columns = NUTRITION_COLS

# Build a new frame rather than adding columns to a slice of recipes_df, which
# is chained assignment (SettingWithCopyWarning) and not safe to re-run.
nutrition_df = pd.concat([recipes_df[['id']], nutrition_values], axis=1)
nutrition_df.head()

Unnamed: 0,id,nutrition_1,nutrition_2,nutrition_3,nutrition_4,nutrition_5,nutrition_6,nutrition_7
0,137739,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [8]:
# Take an explicit copy: adding columns to a bare slice of recipes_df is
# chained assignment and triggers SettingWithCopyWarning.
tags_df = recipes_df[['id', 'tags']].copy()

# Show one raw value: `tags` is the string form of a Python list.
print(tags_df['tags'].iloc[0])

# Flag recipes whose tag string mentions the diet. These are literal substring
# tests (regex=False) on the whole string, not exact tag matches.
tags_df['is_vegan'] = (
    tags_df['tags'].str.contains('vegan', regex=False).astype('int64')
)
tags_df['is_vegetarian'] = (
    tags_df['tags'].str.contains('vegetarian', regex=False).astype('int64')
)
tags_df.head()

['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']


Unnamed: 0,id,tags,is_vegan,is_vegetarian
0,137739,"['60-minutes-or-less', 'time-to-make', 'course...",0,1
1,31490,"['30-minutes-or-less', 'time-to-make', 'course...",0,0
2,112140,"['time-to-make', 'course', 'preparation', 'mai...",0,0
3,59389,"['60-minutes-or-less', 'time-to-make', 'course...",0,0
4,44061,"['weeknight', 'time-to-make', 'course', 'main-...",0,1


In [46]:
# Turn one stringified tag list into plain text: drop the list punctuation
# and replace hyphens with spaces so multi-word tags read as words.
mapping_table = str.maketrans({
    '[': '',
    ']': '',
    "'": '',
    '-': ' ',
})
string = tags_df['tags'][180000].translate(mapping_table)

# Run spaCy's small English pipeline over the cleaned tag text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(string)

# Print only the entities labelled NORP (nationalities, religious or
# political groups), which is where cuisine/region tags tend to surface.
for ent in (found for found in doc.ents if found.label_ == 'NORP'):
    print(ent.text)

north american
american
