In [1]:
import string
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [132]:
def remove_non_verb(text):
    pos_tagged = nltk.pos_tag(text)
    print(pos_tagged)
    verbs = [pos_tagged[0][0]] + [i[0] for i in pos_tagged[1:] if 'VB' in i[1]] # always include 1st word as nltk doesnt perform well here
    return verbs

In [142]:
remove_non_verb(nltk.word_tokenize("hit and simmer"))

[('hit', 'NN'), ('and', 'CC'), ('simmer', 'NN')]


['hit']

In [122]:
wn.synsets('simmer')

[Synset('simmer.n.01'), Synset('simmer.v.01')]

In [128]:
txt2 = "['mix water and flour, divide evenly between two: avocado, avocado, avocado's constant. preheat oven to 2000f', 'preheat the oven to 350f']"
text = "['bring to a boil, cover and simmer gently for 8 to 10 minutes, until soft and pulpy']"
x = re.findall("[\w|\s]+", text)
print(x)
[print(i) for i in x if any([j.isalnum() for j in i.split()])]

['bring to a boil', ' cover and simmer gently for 8 to 10 minutes', ' until soft and pulpy']
bring to a boil
 cover and simmer gently for 8 to 10 minutes
 until soft and pulpy


[None, None, None]

In [138]:
def convert_list(obj):
    """Convert dataframe object(string) to processable list"""
    return [i for i in re.findall("[\w\s]+", obj) if any([j.isalnum() for j in i.split()])]

In [139]:
print(convert_list(text))
steps = [nltk.word_tokenize(i) for i in convert_list(text)]
print(steps)

['bring to a boil', ' cover and simmer gently for 8 to 10 minutes', ' until soft and pulpy']
[['bring', 'to', 'a', 'boil'], ['cover', 'and', 'simmer', 'gently', 'for', '8', 'to', '10', 'minutes'], ['until', 'soft', 'and', 'pulpy']]


In [140]:
[remove_non_verb(step) for step in steps if remove_non_verb(step)]

[('bring', 'NN'), ('to', 'TO'), ('a', 'DT'), ('boil', 'NN')]
[('bring', 'NN'), ('to', 'TO'), ('a', 'DT'), ('boil', 'NN')]
[('cover', 'NN'), ('and', 'CC'), ('simmer', 'NN'), ('gently', 'RB'), ('for', 'IN'), ('8', 'CD'), ('to', 'TO'), ('10', 'CD'), ('minutes', 'NNS')]
[('cover', 'NN'), ('and', 'CC'), ('simmer', 'NN'), ('gently', 'RB'), ('for', 'IN'), ('8', 'CD'), ('to', 'TO'), ('10', 'CD'), ('minutes', 'NNS')]
[('until', 'IN'), ('soft', 'JJ'), ('and', 'CC'), ('pulpy', 'NN')]
[('until', 'IN'), ('soft', 'JJ'), ('and', 'CC'), ('pulpy', 'NN')]


[['bring'], ['cover'], ['until']]

In [44]:
import string
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [54]:
def is_verb(word):
    return 'v' in set(s.pos() for s in wn.synsets(word) if s.name().split('.')[0]==word)

def remove_non_verb(text):
    pos_tagged = nltk.pos_tag(text)
    if pos_tagged:
        verbs = [pos_tagged[0][0]] + [i[0] for i in pos_tagged[1:] if 'VB' in i[1]] # always include 1st word as nltk doesnt perform well here
        return verbs

In [77]:
data = pd.read_csv('data/test_small.csv', header=0).iloc[8:9]

# convert dtype to list of string
# split text sections by comma
data['processed_steps'] = data['steps'].apply(lambda x: x[1:-1].split(","))
print(list(data['processed_steps'])[0])

["'mix potatoes ", ' bacon ', ' green pepper ', ' onion ', " eggs in a large bowl'", " 'add dressing and lemon juice'", " 'mix lightly'", " 'season to taste with salt and pepper'", " 'regrigerate'"]


In [78]:
# tokenize
data['processed_steps'] = data['processed_steps'].apply(lambda steps: [nltk.word_tokenize(step) for step in steps])
print(list(data['processed_steps'])[0])

[["'mix", 'potatoes'], ['bacon'], ['green', 'pepper'], ['onion'], ['eggs', 'in', 'a', 'large', 'bowl', "'"], ["'add", 'dressing', 'and', 'lemon', 'juice', "'"], ["'mix", 'lightly', "'"], ["'season", 'to', 'taste', 'with', 'salt', 'and', 'pepper', "'"], ["'regrigerate", "'"]]


In [79]:
# drop non-verb words by pos-tag
data['processed_steps'] = data['processed_steps'].apply(lambda steps: [remove_non_verb(step) for step in steps if remove_non_verb(step)])
print(list(data['processed_steps'])[0])

[["'mix"], ['bacon'], ['green'], ['onion'], ['eggs'], ["'add"], ["'mix"], ["'season", 'taste'], ["'regrigerate"]]


In [80]:
# remove punctuation
# + flatten
data['processed_steps'] = data['processed_steps'].apply(lambda steps: [word.translate(str.maketrans('', '', string.punctuation)) for step in steps for word in step])

# lemmatization
data['processed_steps'] = data['processed_steps'].apply(lambda steps: [lemmatizer.lemmatize(word, pos='v') for word in steps])

# stopword removal
# + drop by synsets
data['processed_steps'] = data['processed_steps'].apply(lambda steps: [word for word in steps if (is_verb(word) and word not in stop_words)])

print(data['processed_steps'])

8    [mix, green, egg, add, mix, season, taste]
Name: processed_steps, dtype: object
