In [3]:
import re
import pandas as pd
# import swifter
import numpy as np
import spacy
import pickle

ModuleNotFoundError: No module named 'spacy'

In [2]:
nlp = spacy.load('en_core_web_sm')
df = pd.read_csv('../data/my-ingredients.csv')

In [3]:
def change_meta_to_comma_separated(s):
    s = s.replace('[', '')
    s = s.replace(']', '')
    s = s.replace('"', '')
    return s

df['meta'] = df['meta'].apply(change_meta_to_comma_separated)

In [12]:
df.iloc[50:,:].head(50)
# df.info()

Unnamed: 0,name,original,amount,unit,meta
50,pepper,1/2 teaspoon pepper,0.5,teaspoon,
51,phyllo dough,8 sheets phyllo dough,8.0,sheets,
52,ricotta cheese,1 pound ricotta cheese,1.0,pound,
53,salt,1/2 teaspoon salt,0.5,teaspoon,
54,spinach,"2 pounds spinach, chopped and squeezed dry",2.0,pounds,"dry,chopped"
55,frozen spinach,"1 (10 ounce) package frozen chopped spinach, t...",10.0,ounce,"frozen,thawed,chopped"
56,garlic,"2 cloves garlic, minced",2.0,cloves,minced
57,ground cumin,1 teaspoon ground cumin,1.0,teaspoon,
58,mayonnaise,1/2 cup mayonnaise,0.5,cup,
59,onion,1 cup minced onion,1.0,cup,minced


In [13]:
# Helpers
def is_punctuation(s):
    return s in ('(', ')', 
                 '[', ']',
                 '!', '.', 
                 ',', '&',
                 '*')
def is_number(s):
    try:
        float(s)
        return True
    except:
        return False

def fractions_to_floats(line):
    """ Change all fractions to floats """
    has_two_fractions = re.compile(r'(\d+)\s+(\d+)/(\d+)') # i.e. 1 1/8 => 1.13
    has_one_fraction = re.compile(r'(\d+)/(\d+)')          # i.e. 1/2   => 1.5
    one_number = re.compile(r'\d+')                        # i.e. 16, 2 => 16.0, 2.0

    def has_two_handler(n):
        fraction = float(n.group(2)) / float(n.group(3))
        num = float(n.group(1)) + fraction
        return str(round(num, 2))

    def has_one_handler(n):
        fraction = float(n.group(1)) / float(n.group(2))
        return str(round(fraction, 2))

    def one_number_handler(n):
        return str(n.group(0) + '.0')

    if has_two_fractions.search(line):
        return has_two_fractions.sub(has_two_handler, line)
    elif has_one_fraction.search(line):
        return has_one_fraction.sub(has_one_handler, line)
    elif one_number.search(line):
        return one_number.sub(one_number_handler, line)
    else:
        return line

In [31]:
def process_original_string(string):
    doc = nlp(string)
    valid_words = [token for token in doc if not token.is_stop]
    lemma = [0] * len(valid_words)
    pos = [0] * len(valid_words)
    tag = [0] * len(valid_words)
    is_alpha = [0] * len(valid_words)
    is_num = [0] * len(valid_words)
    for i,token in enumerate(valid_words):
        lemma[i] = token.lemma_
        pos[i] = token.pos_
        tag[i] = token.tag_
        is_alpha[i] = token.is_alpha
        is_num[i] = is_number(token.lemma_)
    return {
        'lemma': lemma,
        'pos': pos,
        'tag': tag,
        'is_alpha': is_alpha,
        'is_num': is_num,
    }

def get_lemma(s):
    if not s or s is None or type(s) != str:
        return []
    return [token.lemma_ for token in nlp(s)]

def get_labels(row, processed_original):
    name = get_lemma(row['name'])
    amount = row['amount']
    unit = get_lemma(row['unit'])
    meta = get_lemma(row['meta'])
    # Assemble sent for each word by adding label
    labels = [0] * len(processed_original['lemma'])
    for i,word in enumerate(processed_original['lemma']):
        if is_punctuation(word):
            labels[i] = 'PUNCTUATION'
        elif word in name:
            labels[i] = 'NAME'
        elif is_number(word) and float(word) == float(amount):
            labels[i] = 'QUANTITY'
        elif word in unit:
            labels[i] = 'UNIT'
        elif word in meta:
            labels[i] = 'COMMENT'
        else:
            labels[i] = 'OTHER'
    return labels
    
def get_sents(row):
    processed_row = process_original_string(fractions_to_floats(row['original']))
    labels = get_labels(row, processed_row)
    processed_row['label'] = labels
    return processed_row

In [32]:
sents = df.swifter.apply(get_sents, axis=1)
sents.to_pickle('../data/processed-my-ingredients.pickle')

Pandas Apply: 100%|██████████| 115315/115315 [1:05:30<00:00, 29.34it/s]


In [1]:
sents = pd.read_pickle('../data/processed-my-ingredients.pickle')

# Sents to Features
def sents_to_features(sents):
    features = []
    labels = []
    for token in sents:
        curr_feature = []
        curr_labels = []
        for i in range(len(token['lemma'])):
            curr_feature.append({
                'word': token['lemma'][i],
                'pos': token['pos'][i],
                'tag': token['tag'][i],
                'is_alpha': token['is_alpha'][i],
                'is_num': token['is_num'][i],
                'prev_pos': token['pos'][i-1] if i-1 >= 0 else '',
                'prev_tag': token['tag'][i-1] if i-1 >= 0 else '',
                'prev_num': token['is_num'][i-1] if i-1 >= 0 else '',
                'next_pos': token['pos'][i+1] if i+1 < len(token) else '',
                'next_tag': token['tag'][i+1] if i+1 < len(token) else '',
                'next_num': token['is_num'][i+1] if i+1 < len(token) else '',
            })
            curr_labels.append(token['label'][i])
        features.append(curr_features)
        labels.append(curr_labels)
    return features, labels

final_data = sents_to_features(sents)
pickle.dump(final_data, '../data/features-my-ingredients.pickle')

NameError: name 'pd' is not defined