In [61]:
import pandas as pd
import re

df1 = pd.read_pickle('Recipes_data_Dietary.pkl')
df2 = pd.read_pickle('Recipes_data_Cuisines.pkl')
df3 = pd.read_pickle('Recipes_data_Course.pkl')

In [62]:
df1.columns = ['Title', 'Ingredients', 'Type']
df2.columns = ['Title', 'Ingredients', 'Type']
df3.columns = ['Title', 'Ingredients', 'Type']

In [63]:
df = pd.concat([df1, df2, df3], ignore_index=True)

In [64]:
def process_ingredients(data):
    data = [re.sub(r'[^a-zA-Z\s]', '', ingredient) for ingredient in data]
    data = [re.sub(r'\([^)]*\)', '', ingredient) for ingredient in data]
    data = list(filter(None, data))
    data = [ingredient.replace('scallions spring or green onions', 'green onions') for ingredient in data]
    data = [ingredient for ingredient in data if len(ingredient) >= 1]
    data = [' '.join(ingredient.split()) for ingredient in data]
    data = [ingredient.split(',')[0] for ingredient in data]
    
    return data



df['Title'] = df['Title'].apply(lambda p: re.sub(r'\([^)]*\)', '', p))

df['Ingredients'] = df['Ingredients'].apply(process_ingredients)
df['Ingredients'] = df['Ingredients'].apply(lambda x: list(set(x)))

df = df.drop_duplicates(subset=['Title','Type'], keep='first')
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Title,Ingredients,Type
0,Alice's Key Lime Pie,"[lime juice, vanilla extract, sugar substitute...",Diabetic
1,Carrot Cake Muffin Treats,"[raisins seedless, nutmeg, baking soda, walnut...",Diabetic
2,Creamy Cucumber Salad,"[vinegar, garlic, yogurt plain, black pepper, ...",Diabetic
3,Creamy Ranch Salad Dressing,"[oregano, green onions, yogurt lowfat, cottage...",Diabetic
4,Diabetic Blueberry Muffins,"[lemon zest, blueberries, biscuit baking mix b...",Diabetic
...,...,...,...
9783,Pork and Scallion Wraps,"[hoisin sauce, pork tenderloin, soy sauce tama...",Appetizers
9784,Pork U Pine Meatballs,"[rice, ground beef, tomato sauce, onions]",Appetizers
9785,Portuguese Garlic Dip,"[mayonnaise, worcestershire sauce, garlic clov...",Appetizers
9786,Potato Skins with Cheese and Bacon,"[chives, russet potatoes, cheddar cheese very ...",Appetizers


In [65]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(df['Ingredients'])



In [16]:
def jaccard_similarity(str1, str2):
    a = set(str1.split())
    b = set(str2.split())
    intersection = len(a.intersection(b))
    union = len(a) + len(b) - intersection
    similarity = intersection / union
    return similarity

def remove_colors(string, color_list):
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(color) for color in color_list) + r')\b', flags=re.IGNORECASE)
    string_without_colors = re.sub(pattern, '', string)
    return string_without_colors.strip()


strings = vectorizer.get_feature_names_out()

color_words = ['red', 'blue', 'green', 'yellow','white','black','brown','english','extract']
exclude_words = ["ground", "sauce", "milk", "oil","seeds","powder","flour","crumbs","broth","salt","beef","zest","vinegar","paste","juice","shortening","noodles","stock"]
smst = []

for i in range(len(strings)):
    for j in range(i + 1, len(strings)):
        str1 = remove_colors(strings[i], color_words)
        str2 = remove_colors(strings[j], color_words)
        similarity = jaccard_similarity(str1, str2)
        if(similarity >= 0.5):
            smst.append([strings[i],strings[j]])
            print(f"Jaccard_similarity between '{strings[i]}' and '{strings[j]}': {similarity}")

Jaccard_similarity between 'Chinese rice vinegar' and 'brown rice vinegar': 0.6666666666666666
Jaccard_similarity between 'Chinese rice vinegar' and 'japanese rice vinegar': 0.5
Jaccard_similarity between 'Chinese rice vinegar' and 'rice vinegar': 0.6666666666666666
Jaccard_similarity between 'Chinese sausage' and 'sausage': 0.5
Jaccard_similarity between 'Italian Dressing' and 'Italian Dressing Mix': 0.6666666666666666
Jaccard_similarity between 'Parmesan cheese' and 'blue cheese': 0.5
Jaccard_similarity between 'Parmesan cheese' and 'cheese': 0.5
Jaccard_similarity between 'Whole wheat crackers' and 'stoneground wheat crackers': 0.5
Jaccard_similarity between 'active starter' and 'starter': 0.5
Jaccard_similarity between 'adobo sauce' and 'sauce': 0.5
Jaccard_similarity between 'aioli sauce' and 'sauce': 0.5
Jaccard_similarity between 'alba chocolate' and 'chocolate': 0.5
Jaccard_similarity between 'alba chocolate' and 'chocolate extract': 0.5
Jaccard_similarity between 'alba chocola

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

strs = [remove_colors(string, color_words) for string in strings]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(strs).toarray()

smst = []
for i in range(len(strings)):
    for j in range(i + 1, len(strings)):
        similarity = cosine_similarity([X[i]], [X[j]])[0][0]
        # a = any(word in str1 for word in exclude_words)
        # b = any(word in str2 for word in exclude_words)
        # xnor = (a and b) or (not a and not b)
        # c = ' ' in str1
        # d = ' ' in str2
        # ord = c or d
        if(similarity > 0.8 ): #and xnor and ord
            smst.append([strings[i],strings[j]])
            print(f"Cosine similarity between '{strings[i]}' and '{strings[j]}': {similarity}")

Cosine similarity between 'Almond Red Sauce' and 'almond extract': 0.8317961530960255
Cosine similarity between 'Apple Cider Sauce' and 'apple cider': 0.9122003986967189
Cosine similarity between 'Chinese rice vinegar' and 'chinese black vinegar': 0.8463294027621928
Cosine similarity between 'Gram flour' and 'gram': 0.8205526300859373
Cosine similarity between 'Italian Dressing' and 'Italian Dressing Mix': 0.868986090359897
Cosine similarity between 'Italian Dressing' and 'salad dressing italian': 0.8137851044433708
Cosine similarity between 'Italian Dressing Mix' and 'salad dressing mix italian': 0.849678952871338
Cosine similarity between 'Lemon Cream Cheese Frosting' and 'cream cheese frosting': 0.8504380488549079
Cosine similarity between 'Roasted Baby Vidalia Onions' and 'baby vidalia onions': 0.8478975191660203
Cosine similarity between 'Sage Buttermilk Biscuits' and 'buttermilk biscuits': 0.8057891831929658
Cosine similarity between 'apple cider' and 'apple cider vinegar': 0.861

In [66]:
def remove_colors(string, color_list):
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(color) for color in color_list) + r')\b', flags=re.IGNORECASE)
    string_without_colors = re.sub(pattern, '', string)
    return string_without_colors.strip()
    
from jellyfish import jaro_winkler_similarity

strings = vectorizer.get_feature_names_out()

color_words = ['red', 'blue', 'green', 'yellow','white','black','brown','english','extract']
exclude_words = ["ground", "sauce", "milk", "oil","seeds","powder","flour","crumbs","broth","salt","beef","zest","vinegar","paste","shortening","noodles","stock"]

smst = []

for i in range(len(strings)):
    for j in range(i + 1, len(strings)):
        str1 = remove_colors(strings[i], color_words)
        str2 = remove_colors(strings[j], color_words)
        similarity = jaro_winkler_similarity(str1, str2)
        a = any(word in str1 for word in exclude_words)
        b = any(word in str2 for word in exclude_words)
        xnor = (a and b) or (not a and not b)
        c = ' ' in str1
        d = ' ' in str2
        ord = c or d
        if(similarity > 0.89 and xnor and ord):
            smst.append([strings[i],strings[j]])
            print(f"Jaro-Winkler similarity between '{strings[i]}' and '{strings[j]}': {similarity}")

Jaro-Winkler similarity between 'Italian Dressing' and 'Italian Dressing Mix': 0.96
Jaro-Winkler similarity between 'allspice' and 'allspice berries': 0.9
Jaro-Winkler similarity between 'almond butter' and 'almond extract': 0.8923076923076924
Jaro-Winkler similarity between 'almond extract' and 'almond meal': 0.9090909090909091
Jaro-Winkler similarity between 'almond milk' and 'almond oil': 0.9436363636363637
Jaro-Winkler similarity between 'apple cider' and 'apple juice': 0.905050505050505
Jaro-Winkler similarity between 'apple cider' and 'apple wine': 0.9054545454545454
Jaro-Winkler similarity between 'apple jelly' and 'apple juice': 0.890909090909091
Jaro-Winkler similarity between 'apple juice' and 'apple juice concentrate': 0.8956521739130435
Jaro-Winkler similarity between 'apple juice' and 'apple wine': 0.9054545454545454
Jaro-Winkler similarity between 'apricot brandy' and 'apricot nectar': 0.8961038961038961
Jaro-Winkler similarity between 'apricots' and 'apricots dried': 0.9

In [67]:
sim_list = []
sim_list.append(smst[0])
for i in range(1,len(smst)):
    k = smst[i]
    if k[0] in sim_list[-1] or k[1] in sim_list[-1]:
        sim_list[-1].append(k[0])
        sim_list[-1].append(k[1])
    else:
        sim_list[-1] = list(set(sim_list[-1]))
        sim_list[-1] = sorted(sim_list[-1], key=len)
        sim_list.append(smst[i])
sim_list[-1] = list(set(sim_list[-1]))
sim_list[-1] = sorted(sim_list[-1], key=len)

In [68]:
wdf = df.copy()

In [69]:
def process_ingredients(data, sim_list):
    updated_data = []
    for ing_list in data:
        updated_ings = []
        for ing in ing_list:
            for sims in sim_list:
                if ing in sims:
                    ing = sims[0]
            updated_ings.append(ing)
        updated_data.append(updated_ings)
    return updated_data

wdf['Ingredients'] = process_ingredients(wdf['Ingredients'], sim_list)


In [70]:
wdf.to_pickle('Recipes_data_Whole_Sim_Ingrs.pkl')
wdf = pd.read_pickle('Recipes_data_Whole_Sim_Ingrs.pkl')

In [71]:
wdf

Unnamed: 0,Title,Ingredients,Type
0,Alice's Key Lime Pie,"[lime juice, vanilla bean, sugar substitute, l...",Diabetic
1,Carrot Cake Muffin Treats,"[raisins seedless, nutmeg, baking soda, walnut...",Diabetic
2,Creamy Cucumber Salad,"[vinegar, garlic, yogurt, black pepper, dill s...",Diabetic
3,Creamy Ranch Salad Dressing,"[oregano, green onions, yogurt, cottage cheese...",Diabetic
4,Diabetic Blueberry Muffins,"[lemon zest, blueberries, biscuit baking mix b...",Diabetic
...,...,...,...
9783,Pork and Scallion Wraps,"[hoisin sauce, pork tenderloin, soy sauce dark...",Appetizers
9784,Pork U Pine Meatballs,"[rice, ground ham, tomato sauce, onions]",Appetizers
9785,Portuguese Garlic Dip,"[mayonnaise, worcestershire sauce, garlic, bla...",Appetizers
9786,Potato Skins with Cheese and Bacon,"[chives, russet potatoes, cheddar cheese, sour...",Appetizers


In [72]:
vectorizer.fit_transform(df['Ingredients'])
len(vectorizer.get_feature_names_out())



2153

In [73]:
vectorizer.fit_transform(wdf['Ingredients'])
len(vectorizer.get_feature_names_out())

1615