# Installing the needed packages

In [32]:
!pip install unidecode



# Importing libraries and necessary downloads

In [33]:
import pandas as pd
import numpy as np
import string
import ast

import re
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [34]:
df=pd.read_csv("../datasets/reduced_dataset.csv")

In [35]:
df.shape

(298331, 11)

We are redefining the `NER_list` as a list of ingredients instead of a string.

In [36]:
df['NER_list']=df['NER'].apply(lambda x: ast.literal_eval(x.lower()))

We are cleaning the ingredients by removing punctuations, and words/letter that have a length inferior to 2 in a composite ingredients. For instance, `a fish`, would be transform into `fish`.

In [37]:
STOPWORDS = set(stopwords.words('english'))

def clean_ingredient_list(ingredient_list):
    cleaned = []
    for item in ingredient_list:
        item_clean = item.lower().translate(str.maketrans('', '', string.punctuation)).strip()

        words = item_clean.split()
        filtered_words = [w for w in words if w not in STOPWORDS and len(w) > 2]
        
        if filtered_words:
            cleaned.append(' '.join(filtered_words))
    return cleaned

In [38]:
df['NER_clean']=df['NER_list'].apply(clean_ingredient_list)

Taking the absolute frequencies of ingredienst through `Counter`.

In [39]:
all_ingredients = [ingredient for sublist in df['NER_clean'] for ingredient in sublist]
# Counting frequencies
ingredient_counts = Counter(all_ingredients)
total = sum(ingredient_counts.values())

sorted_ingredients = ingredient_counts.most_common()

Total number of unique ingredients.

In [40]:
len(ingredient_counts)

57579

In [41]:
sorted_ingredients[-20:]

[('dried italian seasoning', 1),
 ('tablspn oilve oil', 1),
 ('torn mint leaf', 1),
 ('recipe basic meatloaf', 1),
 ('nonfat alpine', 1),
 ('potatoes washed big', 1),
 ('green chili optional', 1),
 ('moreparmesan cheese', 1),
 ('fresh zuccini', 1),
 ('green zuccini', 1),
 ('eggplant black olive spread', 1),
 ('zuchinia', 1),
 ('baby spinach loosely', 1),
 ('corn hominy', 1),
 ('yellow boiling potato', 1),
 ('sour rye bread', 1),
 ('zwack', 1),
 ('recipe pizza crust', 1),
 ('zest two lemons', 1),
 ('scallions onions', 1)]

We are going to exclude all those ingredients that represent less than 10% of the overall number of ingredients by count.

In [42]:
cumulative = 0
top_ingredients = set()

for ingredient, count in sorted_ingredients:
    cumulative += count
    top_ingredients.add(ingredient)
    if cumulative / total >= 0.9:
        break

# Function to check for rare ingredients
def has_rare_ingredient(ingredients):
    return any(ing not in top_ingredients for ing in ingredients)

filtered_df = df[~df['NER_clean'].apply(has_rare_ingredient)].reset_index(drop=True)

Shape of the filtered dataframe

In [43]:
filtered_df.shape

(144759, 12)

Counting frequency of the remaining ingredients after filtering for top 90%

In [44]:
all_ingredients_fil = [ingredient for sublist in filtered_df['NER_clean'] for ingredient in sublist]

ingredient_counts = Counter(all_ingredients_fil)
sorted_ingredients = ingredient_counts.most_common()

In [45]:
len(ingredient_counts)

1460

In [46]:
# ingredients_df = pd.DataFrame(sorted_ingredients, columns=['ingredient', 'count'])

# # Export to CSV
# ingredients_df.to_csv('DATASET/sorted_ingredients.csv', index=True)

Computing the TF-IDF vectorization with the column `NER_clean`

In [47]:
ingredient_list = list(top_ingredients)

# Join ingredients to strings to use with CountVectorizer
filtered_df['ingredients_str'] = filtered_df['NER_clean'].apply(lambda x: ' '.join(x))

# Only use frequent ingredients
# vectorizer = CountVectorizer(vocabulary=ingredient_list, binary=True)
vectorizer =TfidfVectorizer(vocabulary=ingredient_list, binary=True)
X = vectorizer.fit_transform(filtered_df['ingredients_str'])

# Convert to DataFrame if you want to inspect
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Extracting the cooking method

In [52]:
def extract_verbs(text):
  tokens=word_tokenize(text)
  tagged=pos_tag(tokens)
  verbs=[word for word,tag in tagged if tag.startswith('VB')]
  return verbs

In [53]:
def print_tag(text):
  tokens=word_tokenize(text)
  tagged=pos_tag(tokens)
  verbs=[f"{word}_{tag}" for word,tag in tagged]
  print(verbs)

lemmatizer = WordNetLemmatizer()

def advanced_preprocess(txt):
    txt = unidecode(txt.lower())
    txt = re.sub(r'[^a-z\s]', ' ', txt)
    tokens = txt.split()

    tokens = [w for w in tokens if w not in STOPWORDS]

    tokens = [t for t in tokens if len(t) > 2]

    return " ".join(tokens)

Defining the set of all remaining ingredients

In [54]:
ingredients=set(all_ingredients_fil)

Defining the function to remove all the verbs that are also ingredients

In [55]:
def filter_verbs(verbs):
    return [v for v in verbs if v.lower() not in ingredients]

In [56]:
filtered_df['verbs'] =filtered_df['directions'].apply(advanced_preprocess).apply(extract_verbs)

Defining a function that lemmatize the verbs

In [57]:
def lemmatization(text):
  tokens = [lemmatizer.lemmatize(t,'v') for t in text]
  return tokens

In [58]:
filtered_df['lemmed_verbs']=filtered_df['verbs'].apply(lemmatization)

Filtering lemmatized verbs and defining a set of verbs to extract unique verbs

In [59]:
filtered_df['filtered_verbs'] = filtered_df['lemmed_verbs'].apply(filter_verbs)
filtered_df['set_verbs']=filtered_df['filtered_verbs'].apply(set)

Defining a list of all verbs present in the recipes

In [60]:
tecniques = [tech for subset in filtered_df['set_verbs'] for tech in subset]

In [61]:
len(tecniques)

983812

Number of unique verbs

In [62]:
len(set(tecniques))

6908

# Filtering recipes for tecniques

Computing the absolute frequency of tecniques and sorting them

In [63]:
tecniques_counts = Counter(tecniques)
total = sum(tecniques_counts.values())

sorted_tecniques = tecniques_counts.most_common()

Computing the top 90% of verbs frequency

In [64]:
cumulative = 0
top_tecniques = set()

for tech, count in sorted_tecniques:
    cumulative += count
    top_tecniques.add(tech)
    if cumulative / total >= 0.90:
        break

def has_rare_tecniques(tecniques):
    return any(tec not in top_tecniques for tec in tecniques)

Filtering the dataset removing all the recipes that contains at least one of the verbs in the bottom 10%

In [65]:
verbs_df = filtered_df[~filtered_df['lemmed_verbs'].apply(has_rare_tecniques)].reset_index(drop=True)

In [66]:
verbs_df.shape

(25976, 18)

Checking remaining tecniques

In [67]:
tecniques = [tech for subset in verbs_df['set_verbs'] for tech in subset]

In [68]:
len(set(tecniques))

233

# Extracting Bigrams of Tecnique-Ingredient

In [69]:
verbs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25976 non-null  int64 
 1   title            25976 non-null  object
 2   ingredients      25976 non-null  object
 3   directions       25976 non-null  object
 4   link             25976 non-null  object
 5   source           25976 non-null  object
 6   NER              25976 non-null  object
 7   NER_list         25976 non-null  object
 8   NER_len          25976 non-null  int64 
 9   directions_len   25976 non-null  int64 
 10  title_lower      25976 non-null  object
 11  NER_clean        25976 non-null  object
 12  ingredients_str  25976 non-null  object
 13  cluster          25976 non-null  int32 
 14  verbs            25976 non-null  object
 15  lemmed_verbs     25976 non-null  object
 16  filtered_verbs   25976 non-null  object
 17  set_verbs        25976 non-null

We identify some ingredients that are ambiguous or wrong, hence the recipes containing at least one of those get removed.

In [70]:
to_discard=set(['mix','ready','summer','containers','favorite','stock','thin','regular','baking'])

Defining a function to discard the recipes that contained `to_discard` ingredients

In [71]:
def contains_discarded_ingredient(ner_list):
    return any(ingredient in to_discard for ingredient in ner_list)

verbs_df = verbs_df[~verbs_df['NER_list'].apply(contains_discarded_ingredient)].reset_index(drop=True)

Checking the shape of the remaining filtered `verbs_df`

In [72]:
verbs_df.shape

(25758, 18)

Redefining the set of ingredients in the remaining recipes

In [73]:
ingredients=set([ingredient for sublist in verbs_df['NER_clean'] for ingredient in sublist])

Only one additional ingredient was removed inadvertedly, because of collinearity, or fully contained within one or more recipes that contained at least one element in `to_discard`

In [74]:
len(ingredients)

1451

Defining a soft-preprocessing function

In [75]:
def soft_preprocess(txt):
    txt = unidecode(txt.lower())
    txt = re.sub(r'[^a-z\s]', ' ', txt)
    tokens = txt.split()

    tokens = [w for w in tokens if w not in STOPWORDS]

    tokens = [t for t in tokens if len(t) > 2]

    return " ".join(tokens)

Creating a cleaned version of the directions, that removes stopwords and tokens with length less than 3

In [76]:
verbs_df['clean_direction']=verbs_df['directions'].apply(soft_preprocess)

Defining a funciton that extract the pairs verb-ingredient, and it checks if the next three words after the verb are ingredient, and if that is the case then they are added to the list of pairs as strings.

In [77]:
def extract_verb_ingredient_pairs(text, verbs, ingredients):
    tokens = text.split()
    pairs = []

    for i in range(len(tokens) - 2):
        first = lemmatizer.lemmatize(tokens[i],'v')
        second = tokens[i + 1]
        third=tokens[i+2]
        # fourth=tokens[i+3]

        if first in verbs and second in ingredients:
            pairs.append(f"{first} {second}")

        if first in verbs and third in ingredients:
            pairs.append(f"{first} {third}")

        # if first in verbs and fourth in ingredients:
        #     pairs.append(f"{first} {fourth}")

    return pairs

Computing the pairs

In [78]:
verbs_df['pairs']= verbs_df.apply(
    lambda row: extract_verb_ingredient_pairs(row['clean_direction'], row['filtered_verbs'], ingredients),
    axis=1
)

Defining the set of pairs

In [79]:
verbs_df['pairs_set']=verbs_df['pairs'].apply(set)

In [80]:
verbs_df.head(2)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,NER_list,NER_len,directions_len,...,NER_clean,ingredients_str,cluster,verbs,lemmed_verbs,filtered_verbs,set_verbs,clean_direction,pairs,pairs_set
0,5,""" Bar"" Cheese","[""32 ounces Velveeta cheese (regular)"", ""1 1/2...","[""In a large sauce pan over low heat, melt the...",www.food.com/recipe/bar-cheese-42151,Gathered,"[""Velveeta cheese"", ""mayonnaise"", ""horseradish...","[velveeta cheese, mayonnaise, horseradish, tab...",5,368,...,"[velveeta cheese, mayonnaise, horseradish, tab...",velveeta cheese mayonnaise horseradish tabasco...,1,"[pan, add, stirring, add, stirring, cool, refr...","[pan, add, stir, add, stir, cool, refrigerate]","[pan, add, stir, add, stir, cool, refrigerate]","{stir, cool, pan, add, refrigerate}",large sauce pan low heat melt velveeta add jar...,"[add horseradish, add mayo, add worcestershire...","{refrigerate choice, add mayo, add worcestersh..."
1,6,""" Barber's"" Chewy Vanilla & Salted Butter Toffee","[""2 14 cups icing sugar"", ""12 cup milk"", ""14 c...","[""Butter a loaf pan."", ""In a pot,combine icing...",www.food.com/recipe/barbers-chewy-vanilla-salt...,Recipes1M,"[""icing sugar"", ""milk"", ""corn syrup"", ""vanilla...","[icing sugar, milk, corn syrup, vanilla bean, ...",5,557,...,"[icing sugar, milk, corn syrup, vanilla bean, ...",icing sugar milk corn syrup vanilla bean butter,3,"[pot, stirring, reduce, remove, stirring, reac...","[pot, stir, reduce, remove, stir, reach, pour,...","[pot, stir, reduce, remove, stir, reach, pour,...","{pot, cut, pour, stir, remove, reduce, reach}",butter loaf pan pot combine icing sugar milk c...,"[remove vanilla, stir butter]","{stir butter, remove vanilla}"


In [81]:
empty_pairs = verbs_df[verbs_df['pairs'].apply(lambda x: len(x) == 0)]
print(len(empty_pairs))

1527


Checking for 'odd' pairs

In [82]:
all_pairs = [pair for sublist in verbs_df['pairs'] for pair in sublist]

pair_counts = Counter(all_pairs)
# Most common pairs
print(pair_counts.most_common(20))

[('remain ingredients', 1978), ('add flour', 1322), ('add water', 1288), ('melt butter', 1228), ('add milk', 1182), ('add ingredients', 1151), ('add sugar', 1128), ('add salt', 1116), ('boil water', 961), ('add garlic', 934), ('add onion', 851), ('add vanilla', 811), ('add eggs', 781), ('add butter', 758), ('add cheese', 611), ('add cream', 604), ('bake soda', 597), ('add egg', 576), ('bake salt', 574), ('add oil', 566)]


Checking for the bottom frequent pairs

In [83]:
# Less common pairs
least_common_pairs = pair_counts.most_common()[-50:]

for pair, count in least_common_pairs:
    print(f"{pair}: {count}")


save egg: 1
save white: 1
fill rolls: 1
toast cheddar: 1
save crust: 1
drain margarine: 1
taste handful: 1
crisp tomato: 1
transfer potato: 1
shake cereal: 1
warm yeast: 1
run jalapenos: 1
start chocolate: 1
saute ginger: 1
want lime: 1
fill ricotta: 1
cook cilantro: 1
mince vinegar: 1
season mustard: 1
peel fish: 1
chill spinach: 1
press water: 1
wrap mayo: 1
simmer wine: 1
tender zucchini: 1
smoke beef: 1
smoke olives: 1
shred feta: 1
bowl tuna: 1
cookie cheese: 1
add angel: 1
shake parmesan: 1
fold herbs: 1
stuff oil: 1
put cumin: 1
stuff zucchini: 1
soften shell: 1
degrees vanilla: 1
spread zucchini: 1
follow crust: 1
crumb taco: 1
let zucchini: 1
stand zucchini: 1
make pesto: 1
combine mint: 1
desire zucchini: 1
uncover zucchini: 1
shake zucchini: 1
smoke zucchini: 1
dissolve figs: 1


In [84]:
verbs_df.to_csv('../datasets/ds_verbs.csv')