# Imports

In [1]:
import pandas as pd
import pickle
%matplotlib inline

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

import string
from unidecode import unidecode
import re

from cleaning_words import read_common_words
from cleaning_words import transform_common_words
from cleaning_words import clean_single_phrase
from cleaning_words import remove_from_single_phrase
from cleaning_words import convert_list_from_dict
from cleaning_words import make_clean_keywords
from cleaning_words import clean_keyword

In [2]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced again in the visible part of this
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load("en_core_web_sm")

# Constants

## Load:

In [None]:
# Parquet inputs read in the "Load data" section.
RECIPES_BASE = "../Data/base/recipes_base.parquet"
RECIPES_DESCRIPTIVE = "../Data/base/recipes_descriptive.parquet"

# Word list handed to read_common_words() in the "Common words" section.
COMMON_WORDS_OLD = '../words.txt'

## Save:

In [6]:
# Destination for the transformed common-word list (its save cell is commented out).
COMMON_WORDS = '../cleaned_files/common_words.txt'

# Pickle destinations for the cleaned per-recipe results.
# NOTE(review): KEYWORDS_CLEAN is not used by any save cell in the visible notebook.
ING_CLEAN = '../cleaned_files/ingredients_clean.obj'
ING_CLEAN_NO_COMMON = '../cleaned_files/ingredients_clean_without_common_words.obj'
KEYWORDS_CLEAN = '../cleaned_files/keywords_cleaned.obj'
CATEGORIES_CLEAN = '../cleaned_files/categories_cleaned.obj'
NAMES_CLEAN = '../cleaned_files/names_cleaned.obj'
NAMES_CLEAN_NO_COMMON = '../cleaned_files/names_clean_no_common.obj'

# Pickle destinations for the raw -> cleaned lookup dictionaries.
ING2CLEAN_DICT = "../cleaned_files/ingredient_2_clean_dict.obj"
ING2CLEAN_NO_COMMON_DICT = "../cleaned_files/ingredient_2_clean_no_common_dict.obj"
KEY2CLEAN_DICT = "../cleaned_files/keywords_2_clean_dict.obj"
CAT2CLEAN_DICT = "../cleaned_files/category_2_clean_dict.obj"
NAME2CLEAN_DICT = "../cleaned_files/name_2_clean_dict.obj"
NAME2CLEAN_NO_COMMON_DICT = "../cleaned_files/name_2_clean_no_common_dict.obj"

# Load data

In [8]:
# Read both recipe tables from the parquet paths defined in the constants cells.
recipes_base = pd.read_parquet(RECIPES_BASE)
recipes_desc = pd.read_parquet(RECIPES_DESCRIPTIVE)

In [9]:
# One-column DataFrame of RecipeId; later cells merge it by row index onto
# each cleaned result so every row carries its recipe identifier.
recipe_ids = pd.DataFrame(recipes_base['RecipeId'])

# Ingredients inspection

In [59]:
# Flatten every recipe's ingredient list into a single list, then collapse it
# into the set of distinct raw ingredients.
all_ingredients = [
    raw_ingredient
    for recipe_ingredients in recipes_desc.Ingredients.tolist()
    for raw_ingredient in recipe_ingredients
]
all_ingredients_unique = set(all_ingredients)

In [60]:
# Report how many distinct raw ingredients were collected.
print('Number of unique ingredients: ', len(all_ingredients_unique))

Number of unique ingredients:  7368


# Common words

## Read common words

In [61]:
# Read the word list from disk and pass it straight through
# transform_common_words(); the result is what remove_from_single_phrase()
# receives in the later cleaning cells.
common_words = transform_common_words(read_common_words(COMMON_WORDS_OLD))
print('Number of common words to remove: ', len(common_words))

Number of common words to remove:  451


## Save unique common words

In [62]:
# with open(COMMON_WORDS, 'w') as f:
#     f.writelines('\n'.join(common_words))

# Clean ingredients

## Create cleaned ingredients' dictionaries

### Ingredient to clean ingredient
    Key:   Ingredient
    Value: Clean ingredient

In [63]:
# Lookup table: raw ingredient -> clean_single_phrase(raw ingredient).
# Built once over the distinct ingredients so each phrase is cleaned a single time.
ingredient_2_clean_ingredient_dict = {
    raw_ingredient: clean_single_phrase(raw_ingredient)
    for raw_ingredient in all_ingredients_unique
}

### Ingredient to clean ingredient without common words
    Key:   Ingredient
    Value: Clean ingredient without common words

In [65]:
# Lookup table: raw ingredient -> its cleaned form with `common_words` removed.
# The already-cleaned phrase is reused rather than cleaning the raw text again.
ingredient_2_clean_no_common_dict = {
    raw_ingredient: remove_from_single_phrase(
        ingredient_2_clean_ingredient_dict[raw_ingredient], common_words
    )
    for raw_ingredient in all_ingredients_unique
}

## Transform ingredients' lists

### Clean ingredients dataframe

In [17]:
# Run convert_list_from_dict on each recipe's ingredient list, passing the
# raw -> clean lookup as `dictionary_phrases`. The result is a Series aligned
# with recipes_desc.
clean_ingredients = recipes_desc['Ingredients'].apply(convert_list_from_dict, dictionary_phrases=ingredient_2_clean_ingredient_dict)

### Unique clean ingredients 

In [18]:
# Flatten the cleaned per-recipe lists and keep the distinct cleaned ingredients.
all_clean_ingredients = [
    cleaned_ingredient
    for recipe_ingredients in clean_ingredients.tolist()
    for cleaned_ingredient in recipe_ingredients
]
all_clean_ingredients_unique = set(all_clean_ingredients)

In [19]:
# Report how many distinct ingredients remain after cleaning.
print('Number of unique ingredients: ', len(all_clean_ingredients_unique))

Number of unique ingredients:  6463


In [20]:
# Display the cleaned ingredient lists (pandas abbreviates the repr of this long Series).
clean_ingredients

0         [blueberry, granulate sugar, lemon juice, vani...
1         [basmati rice, boneless chicken, cardamom seed...
2         [fresh lemon juice, fresh water, lemon rind, l...
3         [black pepper, eggplant, extra firm tofu, garl...
4         [cabbage, carrot, celery, onion, plain tomato ...
                                ...                        
522512    [bake soda, buttermilk, cinnamon, dark brown s...
522513    [brandy, cognac, dijon mustard, garlic, pepper...
522514              [brandy, half half, heavy cream, sugar]
522515    [dill, english cucumber, pickled ginger, smoke...
522516    [breakfast sausage, flour, hard boil egg, pank...
Name: Ingredients, Length: 522517, dtype: object

### Remove common words -> dataframe

In [26]:
# Same per-recipe conversion as for the cleaned ingredients, but using the
# lookup whose values have the common words removed.
clean_ingredients_withot_common = recipes_desc['Ingredients'].apply(convert_list_from_dict, 
                                                                     dictionary_phrases=ingredient_2_clean_no_common_dict)

### Unique clean ingredients without common words

In [22]:
# Flatten the common-word-free lists and keep the distinct values.
# NOTE: this rebinds `all_ingredients` / `all_ingredients_unique`, which held
# the raw ingredients until this point.
all_ingredients = [
    cleaned_ingredient
    for recipe_ingredients in clean_ingredients_withot_common.tolist()
    for cleaned_ingredient in recipe_ingredients
]
all_ingredients_unique = set(all_ingredients)

In [23]:
# Report how many distinct ingredients remain once common words are removed.
print('Number of unique ingredients after cleaning: ', len(all_ingredients_unique))

Number of unique ingredients after cleaning:  5070


In [24]:
# Attach RecipeId to the cleaned ingredient lists by joining on the row index.
# The result must be bound to `clean_ingredients` (not a misspelled name) so
# that the ING_CLEAN save cell, which dumps `clean_ingredients`, writes the
# frame that includes RecipeId.
# assumes recipes_base and recipes_desc share the same row index — TODO confirm
clean_ingredients = recipe_ids.merge(pd.DataFrame(clean_ingredients), left_index=True, right_index=True)

In [25]:
# Attach RecipeId by joining on the row index; rebinds the name from a Series
# to a DataFrame, so this cell is meant to run exactly once per kernel session.
# assumes recipes_base and recipes_desc share the same row index — TODO confirm
clean_ingredients_withot_common = recipe_ids.merge(pd.DataFrame(clean_ingredients_withot_common), left_index=True, right_index=True)

## Save

In [82]:
# with open(ING_CLEAN, 'wb') as pickle_file:
#     pickle.dump(clean_ingredients, pickle_file)

In [83]:
# with open(ING_CLEAN_NO_COMMON, 'wb') as pickle_file:
#     pickle.dump(clean_ingredients_withot_common, pickle_file)

# Recipes keywords

## Unique keywords

In [67]:
# Flatten every recipe's keyword list into one list (duplicates kept).
all_keywords = [
    keyword
    for recipe_keywords in recipes_desc.Keywords.tolist()
    for keyword in recipe_keywords
]

In [68]:
# Distinct keywords across all recipes.
all_keywords_unique = set(all_keywords)

In [69]:
# Report how many distinct keywords were collected.
print('Number of unique keywords: ', len(all_keywords_unique))

Number of unique keywords:  315


## Create keyword dictionary

In [70]:
# Lookup table: raw keyword -> clean_keyword(raw keyword).
keywords_2_cleaned_dict = {
    raw_keyword: clean_keyword(raw_keyword)
    for raw_keyword in all_keywords_unique
}

## Clean keywords -> dataframe

In [69]:
# Run convert_list_from_dict on each recipe's keyword list, passing the
# raw -> clean keyword lookup as `dictionary_phrases`.
cleaned_keywords = recipes_desc.Keywords.apply(convert_list_from_dict, 
                                                dictionary_phrases=keywords_2_cleaned_dict)

In [70]:
# Attach RecipeId by joining on the row index; rebinds the name from a Series
# to a DataFrame, so this cell is meant to run exactly once per kernel session.
# assumes recipes_base and recipes_desc share the same row index — TODO confirm
cleaned_keywords = recipe_ids.merge(pd.DataFrame(cleaned_keywords), left_index=True, right_index=True)

# Recipes categories

## Unique categories

In [72]:
# Report the number of distinct category values (nunique() excludes missing values by default).
print('Number of unique categories: ', recipes_desc.RecipeCategory.nunique())

Number of unique categories:  311


In [76]:
# Distinct category values as an array; unlike nunique(), unique() keeps a
# missing value as an entry if the column contains one.
all_categories_unique = recipes_desc.RecipeCategory.unique()

## Create categories dictionary

In [77]:
# Lookup table: raw category -> clean_keyword(raw category).
category_2_cleaned_dict = {
    raw_category: clean_keyword(raw_category)
    for raw_category in all_categories_unique
}

## Clean recipe categories

In [75]:
# Replace each recipe's category with its cleaned form. Plain dict indexing is
# used, so a category missing from the lookup raises KeyError instead of
# silently producing a missing value.
clean_recipes_categories = recipes_desc.RecipeCategory.apply(lambda x: category_2_cleaned_dict[x])

In [76]:
# Attach RecipeId by joining on the row index; rebinds the name from a Series
# to a DataFrame, so this cell is meant to run exactly once per kernel session.
# assumes recipes_base and recipes_desc share the same row index — TODO confirm
clean_recipes_categories = recipe_ids.merge(pd.DataFrame(clean_recipes_categories), left_index=True, right_index=True)

In [81]:
# with open(CATEGORIES_CLEAN, 'wb') as pickle_file:
#     pickle.dump(clean_recipes_categories, pickle_file)

# Names

## Create name dictionary

In [71]:
# Collect every recipe name, reduce to the distinct ones, and build the
# lookup table: raw name -> clean_single_phrase(raw name).
all_names = list(recipes_base.Name)
all_names_unique = set(all_names)
name_2_cleaned_dict = {
    raw_name: clean_single_phrase(raw_name)
    for raw_name in all_names_unique
}

## Clean names

In [46]:
# Replace each recipe name with its cleaned form using `name_2_cleaned_dict`
# (the lookup defined in the "Create name dictionary" cell). Plain dict
# indexing is used, so a name missing from the lookup raises KeyError.
clean_names = recipes_base.Name.apply(lambda x: name_2_cleaned_dict[x])

In [47]:
# Attach RecipeId by joining on the row index; rebinds the name from a Series
# to a DataFrame, so this cell is meant to run exactly once per kernel session.
clean_names = recipe_ids.merge(pd.DataFrame(clean_names), left_index=True, right_index=True)

In [39]:
# Distinct cleaned names. Taken from the values of `name_2_cleaned_dict`:
# a set of its items would hold one (raw, cleaned) pair per raw name and so
# would not de-duplicate cleaned names at all.
clean_names_unique = set(name_2_cleaned_dict.values())

## Create name dictionary without common words

In [40]:
# Lookup table: raw name -> its cleaned form with `common_words` removed.
name_2_clean_no_common = {
    raw_name: remove_from_single_phrase(cleaned_name, common_words)
    for raw_name, cleaned_name in name_2_cleaned_dict.items()
}

## Clean names without common words

In [48]:
# Map every recipe name to its common-word-free cleaned form, then attach
# RecipeId by joining on the row index — done as a single expression so the
# name is bound only once, to the final DataFrame.
clean_names_no_common = recipe_ids.merge(
    pd.DataFrame(recipes_base.Name.apply(lambda x: name_2_clean_no_common[x])),
    left_index=True,
    right_index=True,
)

## Names with duplicates

In [9]:
# Show every recipe whose Name occurs more than once; keep=False flags all
# occurrences of a repeated name, including the first.
recipes_base[recipes_base.Name.duplicated(keep=False)]

Unnamed: 0,RecipeId,Name,AuthorId,CookTimeInMinutes,PrepTimeInMinutes,TotalTimeInMinutes,DatePublished,Description,RecipeCategory,Keywords,Ingredients,RecipeServings,RecipeInstructions,Nutritions
1,39,Biryani,1567,25.0,240.0,265.0,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[saffron, milk, hot green chili peppers, onion...",6.0,[Soak saffron in warm milk for 5 minutes and p...,"[58.8, 16.6, 372.8, 368.4, 84.4, 9.0, 20.4, 63.4]"
2,40,Best Lemonade,1566,5.0,30.0,35.0,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[sugar, lemons, rind of, lemon, zest of, fresh...",4.0,"[Into a 1 quart Jar with tight fitting lid, pu...","[0.2, 0.0, 0.0, 1.8, 81.5, 0.4, 77.2, 0.3]"
4,42,Cabbage Soup,1538,30.0,20.0,50.0,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[plain tomato juice, cabbage, onion, carrots, ...",4.0,"[Mix everything together and bring to a boil.,...","[0.4, 0.1, 0.0, 959.3, 25.1, 4.8, 17.7, 4.3]"
9,47,Butter Pecan Cookies,1573,9.0,55.0,64.0,1999-09-07 09:01:00+00:00,Make and share this Butter Pecan Cookies recip...,Dessert,"[Cookie & Brownie, Fruit, Nuts, Weeknight, Ove...","[butter, brown sugar, granulated sugar, vanill...",,"[Preheat oven to 350 degrees., Cream butter in...","[5.6, 1.4, 6.3, 15.0, 4.5, 0.6, 1.6, 0.8]"
10,48,Boston Cream Pie,1545,0.0,135.0,135.0,1999-08-24 04:35:00+00:00,Make and share this Boston Cream Pie recipe fr...,Pie,"[Dessert, Weeknight, Oven, < 4 Hours]","[margarine, cake flour, baking powder, salt, s...",8.0,"[Beat egg whites until soft peaks form., Gradu...","[36.4, 12.9, 105.9, 722.3, 84.0, 1.6, 46.2, 8.8]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522413,541278,Sherried Artichoke Chicken,298447,25.0,25.0,50.0,2020-11-30 17:58:00+00:00,Make and share this Sherried Artichoke Chicken...,Chicken Breast,"[Chicken, Poultry, Meat, Healthy, < 60 Mins]","[paprika, salt, pepper, boneless skinless chic...",,"[Combine the paprika, salt & pepper; sprinkle ...","[9.8, 4.5, 90.8, 560.5, 18.4, 9.3, 2.5, 31.4]"
522431,541298,Homemade Ketchup,2002848998,0.0,5.0,5.0,2020-11-30 18:03:00+00:00,Homemade ketchup is so easy and is made with j...,< 15 Mins,[Easy],"[tomato paste, white sugar, cold water, allspi...",3.0,"[Whisk all ingredients together in a bowl., En...","[0.7, 0.1, 0.0, 1812.3, 38.5, 5.8, 28.6, 5.8]"
522451,541318,Peppermint Hot Chocolate,2002835852,0.0,5.0,5.0,2020-12-07 13:54:00+00:00,Make and share this Peppermint Hot Chocolate r...,Beverages,"[Kid Friendly, < 15 Mins, Easy]","[milk, sugar, salt]",1.0,"[Mixed cocoa powder, sugar, salt, and 1/2 of t...","[10.9, 6.7, 34.2, 277.9, 44.7, 4.8, 25.2, 10.8]"
522460,541327,Eggplant Casserole,48920,30.0,45.0,75.0,2020-12-07 19:03:00+00:00,Make and share this Eggplant Casserole recipe ...,< 4 Hours,[None],"[lean ground beef, onion, bell pepper, eggplan...",6.0,"[In a large skillet saute ground beef, onion, ...","[20.0, 10.6, 124.0, 393.5, 10.6, 3.5, 4.4, 28.1]"


## Save files

In [34]:
# with open(NAMES_CLEAN, 'wb') as pickle_file:
#     pickle.dump(clean_names, pickle_file)

In [51]:
# with open(NAMES_CLEAN_NO_COMMON, 'wb') as pickle_file:
#     pickle.dump(clean_names_no_common, pickle_file)

# Save dictionaries

In [78]:
# with open(ING2CLEAN_DICT, 'wb') as pickle_file:
#     pickle.dump(ingredient_2_clean_ingredient_dict, pickle_file)
    
# with open(ING2CLEAN_NO_COMMON_DICT, 'wb') as pickle_file:
#     pickle.dump(ingredient_2_clean_no_common_dict, pickle_file)
    
# with open(KEY2CLEAN_DICT, 'wb') as pickle_file:
#     pickle.dump(keywords_2_cleaned_dict, pickle_file)
    
# with open(CAT2CLEAN_DICT, 'wb') as pickle_file:
#     pickle.dump(category_2_cleaned_dict, pickle_file)
    
# with open(NAME2CLEAN_DICT, 'wb') as pickle_file:
#     pickle.dump(name_2_cleaned_dict, pickle_file)
    
# with open(NAME2CLEAN_NO_COMMON_DICT, 'wb') as pickle_file:
#     pickle.dump(name_2_clean_no_common, pickle_file)