In [1]:
import pandas as pd
import dask.dataframe as dd
import json
import unicodedata

# Unicode character preprocessing for the NER dataset (which was not previously preprocessed in this manner)
The preprocessing code is copied from `02-dataset-tuning`.

In [2]:
def trim_and_remove_empty_lines(inputList: list):
    """Strip surrounding whitespace from each entry and drop entries left empty.

    Args:
        inputList: list of strings.

    Returns:
        A new list with the stripped, non-empty strings in their original order.
    """
    stripped = (entry.strip() for entry in inputList)
    return [entry for entry in stripped if len(entry) > 0]

In [3]:
# (character, replacement) pairs that `replacer` applies one after another.
# Non-ASCII punctuation, accented letters and fraction characters are mapped to
# ASCII spellings; U+2022 is removed outright (empty replacement).
# U+00B0 and U+00BA are mapped to the placeholder 'DEGREE_SIGN', which
# `parse_line` converts back to U+00B0 as its final step.
replace_list = [
    ('\u2044', '/'),('\u00b0', 'DEGREE_SIGN'),('\u00bd', '1/2'),('\u00bc', '1/4'),
    ('\u00be', '3/4'),('\u00e9', 'e'),('\u2019', "'"),('\u00ae', '(R)'),
    ('\u2013', '-'),('\u00ba', 'DEGREE_SIGN'),('\u00a0', ' '),('\u00f1', 'n'),
    ('\u2153', '1/3'),('\u2152', '1/10'),('\u2151', '1/9'),('\u2150', '1/7'),
    ('\u2154', '2/3'),('\u2155', '1/5'),('\u2156', '2/5'),('\u2157', '3/5'),
    ('\u2158', '4/5'),('\u2159', '1/6'),('\u215a', '5/6'),('\u215b', '1/8'),
    ('\u215c', '3/8'),('\u215d', '5/8'),('\u215e', '7/8'),('\u201d', '"'),
    ('\u2014', '-'),('\u00e8', 'e'),('\u201c', '"'),('\u2033', '"'),
    ('\u2022', ''),('\u00d7', 'x'),('\u00ee', 'i'),('\u2026', '...'),
    ('\u00c2', 'A'),('\u2122', 'TM'),('\u2018', "'"),('\u00ad', '-'),('\u2028', ' '),
]

In [4]:
def replacer(arg: str):
    """Apply every substitution in ``replace_list`` to ``arg``, in list order.

    Args:
        arg: the text to clean.

    Returns:
        The text after all (old, new) replacements have been applied.
    """
    text = arg
    for old, new in replace_list:
        text = text.replace(old, new)
    return text


def parse_line(line: str):
    """Normalise one line of text to (almost) plain ASCII.

    Steps, in order:
      1. apply the explicit substitutions via ``replacer``;
      2. if non-ASCII characters remain, NFKD-decompose the text and discard
         whatever still cannot be encoded as ASCII;
      3. turn tabs and newlines into spaces and collapse runs of spaces;
      4. turn the 'DEGREE_SIGN' placeholder back into U+00B0.

    Args:
        line: the raw text.

    Returns:
        The normalised text; the only non-ASCII character it can contain is
        the degree sign restored in the last step.
    """
    text = replacer(line)
    if not text.isascii():
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode("utf-8")

    # tabs and newlines become ordinary spaces ...
    for whitespace in ('\t', '\n'):
        text = text.replace(whitespace, ' ')
    # ... and any run of spaces is squeezed down to a single space
    while '  ' in text:
        text = text.replace('  ', ' ')

    # bring back the degree sign (done last so the ASCII filter cannot drop it)
    return text.replace('DEGREE_SIGN', '\u00b0')


def parse_list_of_lines(lst: list):
    """Run ``parse_line`` over every entry of ``lst`` and return the results as a new list."""
    return [parse_line(entry) for entry in lst]

In [5]:
# Load only the columns this notebook uses. The file also holds `directions`
# and two leftover index columns ('Unnamed: 0', 'Unnamed: 0.1'); selecting with
# `usecols` avoids parsing them just to drop them afterwards. Column order
# follows the file, so the resulting frame has the same columns as before.
ner = pd.read_csv('0-datasets/ner-dataset.csv', usecols=['ingredients', 'title', 'NER'])

In [6]:
# Decode each JSON-encoded ingredient list, clean every entry, and encode it
# back to JSON. The re-encoded string is what the later groupby and join use
# as their key.
ner['ingredients'] = (
    ner['ingredients']
    .map(json.loads)                    # JSON text -> list of strings
    .map(trim_and_remove_empty_lines)   # strip entries, drop empty ones
    .map(parse_list_of_lines)           # Unicode / whitespace normalisation
    .map(json.dumps)                    # list -> JSON text again
)
ner.head()

Unnamed: 0,ingredients,title,NER
0,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...",No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,"[""1 small jar chipped beef, cut up"", ""4 boned ...",Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...",Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...",Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,"[""1 c. peanut butter"", ""3/4 c. graham cracker ...",Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [7]:
# For every distinct ingredients string, gather the set of NER values that
# appear with it (the result is indexed by the ingredients string).
ing_ner = ner.groupby('ingredients')['NER'].apply(set)

In [8]:
# One entry per distinct ingredients string (the groupby key).
ing_ner.shape

(2435938,)

In [9]:
# Peek at the grouped result: index = ingredients string, value = set of NER strings.
ing_ner.head()

ingredients
[" (0.80kg/1.75lb) fingerling potatoes", " (0.34kg/0.75lb) brussels sprouts", " 3 garlic cloves, minced", " 2 tbsp fresh minced rosemary", " 1 tbsp + 1 tsp extra virgin olive oil", " 1/2 tbsp Sucanat (optional)", " 3/4 tsp fine grain sea salt", " Freshly ground black pepper", " 1/4 tsp red pepper flakes (optional)"]                                                                                                                                                                                                {["fingerling potatoes", "brussels sprouts", "...
[" (6) Jumbo Scallops", " (9) bacon strips", " (6) frozen shrimp, chopped and de-tailed", " (2) Cloves Garlic, crushed;", " 1/4 c. Brown Mushrooms, sliced;", " Jarlsburg cheese (shredded)", " Mozzarella cheese (shredded)", " 1 tsp. Tarragon", " 1/4 c. Fresh Parsley (minced)", " 1 1/2 T. Thyme", " 1/2 tsp. Garlic Chili paste", " 3/4 c. milk or light cream", " 1/2 c. white wine", " 1/4 c. lemon juice", " 1/3 c. red onion (chopp

In [10]:
# Does every ingredients string come with exactly one distinct NER value?
(ing_ner.map(len) == 1).all()

False

In [11]:
# Same uniqueness check after converting each set to a list. The conversion
# does not change the number of elements, so the answer matches the check on
# the sets themselves.
ing_ner.apply(list).map(lambda x: len(x) == 1).all()

False

In [12]:
def _pick_ner(candidates):
    """Choose one NER value from the values grouped under one ingredients string.

    Taking "the first element" of a set depends on set iteration order, which
    for strings changes between interpreter runs (hash randomisation), so the
    smallest element is taken instead to make the choice reproducible.

    Args:
        candidates: the collection of NER values for one ingredients string,
            or an already-collapsed single value if this cell is run again.

    Returns:
        The smallest candidate when ordered by its string form, or
        ``candidates`` unchanged if it is not a collection.
    """
    if isinstance(candidates, (set, frozenset, list, tuple)):
        # key=str keeps the comparison valid even if a non-string value
        # (e.g. NaN) sits in the collection next to strings.
        return min(candidates, key=str)
    # Already a single value: re-running the cell must not mangle it.
    return candidates


# The checks above show that some ingredients strings carry more than one NER
# value; keep a single, deterministically chosen one per ingredients string.
ing_ner = ing_ner.map(_pick_ner)
ing_ner.head()

ingredients
[" (0.80kg/1.75lb) fingerling potatoes", " (0.34kg/0.75lb) brussels sprouts", " 3 garlic cloves, minced", " 2 tbsp fresh minced rosemary", " 1 tbsp + 1 tsp extra virgin olive oil", " 1/2 tbsp Sucanat (optional)", " 3/4 tsp fine grain sea salt", " Freshly ground black pepper", " 1/4 tsp red pepper flakes (optional)"]                                                                                                                                                                                                ["fingerling potatoes", "brussels sprouts", "g...
[" (6) Jumbo Scallops", " (9) bacon strips", " (6) frozen shrimp, chopped and de-tailed", " (2) Cloves Garlic, crushed;", " 1/4 c. Brown Mushrooms, sliced;", " Jarlsburg cheese (shredded)", " Mozzarella cheese (shredded)", " 1 tsp. Tarragon", " 1/4 c. Fresh Parsley (minced)", " 1 1/2 T. Thyme", " 1/2 tsp. Garlic Chili paste", " 3/4 c. milk or light cream", " 1/2 c. white wine", " 1/4 c. lemon juice", " 1/3 c. red onion (chopp

In [13]:
# Inspect the first value in full: a single JSON-encoded NER list (a string).
ing_ner.iloc[0]

'["fingerling potatoes", "brussels sprouts", "garlic", "rosemary", "+", "salt", "ground black pepper", "red pepper"]'

In [14]:
# Load the dataset that the NER values get attached to (joined on its
# `ingredients` column further down) and show its dimensions.
core = pd.read_csv('0-datasets/v-1.1.1-single-h.csv')
core.shape

(2231142, 5)

In [15]:
# Attach NER to every core row by matching core's `ingredients` column against
# ing_ner's index. DataFrame.join is a left join by default, so all core rows
# are kept and rows without a matching ingredients string get a missing NER.
# `rsuffix` only takes effect if a column name overlaps between the two sides.
df = core.join(ing_ner, on="ingredients", rsuffix="_ing")

In [16]:
# Preview the joined frame, now carrying the extra NER column.
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [17]:
# Summary per column. `count` excludes missing values, so an NER count lower
# than the other columns' counts is the number of rows still lacking NER.
df.describe()

Unnamed: 0,title,ingredients,directions,link,source,NER
count,2231142,2231142,2231142,2231142,2231142,2196172
unique,1312871,2226362,2211644,2231142,2,2102226
top,Chicken Casserole,"[""1 c. peanut butter"", ""1 c. sugar"", ""1 egg""]","[""Mix all ingredients together.""]",www.cookbooks.com/Recipe-Details.aspx?id=357269,Gathered,"[""sugar"", ""water""]"
freq,4099,28,274,1,1643098,162


In [18]:
# Sanity check: the row count should equal core's, with one added column.
df.shape

(2231142, 6)

In [19]:
# Persist the joined frame; "incomplete" in the file name matches the rows
# that still have no NER value.
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is written as an extra unnamed first column — confirm that downstream
# readers expect it, otherwise pass index=False.
df.to_csv('0-datasets/v-1.1.1-single-h-NER-incomplete.csv')