In [10]:
import pandas as pd
import string
import html
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from pandarallel import pandarallel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import joblib
from scipy import sparse
import numpy as np

pandarallel.initialize(nb_workers=8)

nltk.download('wordnet')


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package wordnet to /home/amogus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
recipes_df = pd.read_parquet("~/resources/food/recipes.parquet")
recipes_df["RecipeId"] = recipes_df["RecipeId"].astype(int)
float_cols = recipes_df.dtypes[recipes_df.dtypes != "int64"][
    recipes_df.dtypes == "float64"
].index
recipes_df.loc[:, float_cols] = recipes_df.loc[:, float_cols].fillna(0.0)
categorical_cols = recipes_df.dtypes[recipes_df.dtypes != "int64"][
    recipes_df.dtypes != "float64"
].index
recipes_df.loc[:, categorical_cols] = recipes_df.loc[:, categorical_cols].fillna("")

recipes_df


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,541379,Meg's Fresh Ginger Gingerbread,2002090414,rdsxc,PT35M,PT1H,PT1H35M,2020-12-22 15:27:00+00:00,Make and share this Meg's Fresh Ginger Gingerb...,[],...,7.6,54.4,278.2,48.5,0.8,22.8,3.9,8.0,1 8x8 cake pan,[Preheat oven to 350&deg;F Grease an 8x8 cake ...
522513,541380,Roast Prime Rib au Poivre with Mixed Peppercorns,211566,Denver cooks,PT3H,PT30M,PT3H30M,2020-12-22 15:32:00+00:00,"White, black, green, and pink peppercorns add ...",[https://img.sndimg.com/food/image/upload/w_55...,...,71.4,433.8,766.3,3.2,0.7,0.1,117.0,8.0,1 Roast,[Position rack in center of oven and preheat t...
522514,541381,Kirshwasser Ice Cream,2001131545,Jonathan F.,PT3H,PT1H,PT4H,2020-12-22 15:33:00+00:00,Make and share this Kirshwasser Ice Cream reci...,[],...,72.6,470.9,192.5,33.9,0.0,17.3,12.8,6.0,,[heat half and half and heavy cream to a simme...
522515,541382,Quick & Easy Asian Cucumber Salmon Rolls,2001004241,CLUBFOODY,,PT15M,PT15M,2020-12-22 22:11:00+00:00,"Extremely quick and easy to make, these are gr...",[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,2.9,100.5,0.3,0.0,0.1,2.4,0.0,20 rolls,"[In a small bowl, combine mayo and wasabi past..."


In [8]:
np.unique(recipes_df["Keywords"].explode().fillna(""))


array(['', '< 15 Mins', '< 30 Mins', '< 4 Hours', '< 60 Mins', 'African',
       'Apple', 'Artichoke', 'Asian', 'Australian', 'Austrian', 'Avocado',
       'Baked Beans', 'Baking', 'Bar Cookie', 'Bass', 'Bath/Beauty',
       'Bean Soup', 'Beans', 'Bear', 'Beef Barley Soup', 'Beef Crock Pot',
       'Beef Kidney', 'Beef Liver', 'Beef Organ Meats', 'Beef Sandwiches',
       'Beef Sauces', 'Beginner Cook', 'Belgian', 'Berries', 'Beverages',
       'Birthday', 'Black Beans', 'Brazilian', 'Bread Machine', 'Breads',
       'Breakfast', 'Breakfast Eggs', 'Breakfast Potatoes', 'Broil/Grill',
       'Brown Rice', 'Brunch', 'Cabbage', 'Cajun', 'Cambodian', 'Camping',
       'Canadian', 'Candy', 'Canning', 'Cantonese', 'Caribbean',
       'Catfish', 'Cauliflower', 'Chard', 'Cheese', 'Cheesecake',
       'Cherries', 'Chicken', 'Chicken Breast', 'Chicken Livers',
       'Chicken Stew', 'Chicken Stews', 'Chicken Thigh & Leg', 'Chilean',
       'Chinese', 'Chinese New Year', 'Chocolate Chip Cookies',

In [None]:
recipes_df["RecipeIngredientParts"]


In [53]:
recipes_df.loc[:, "Name"] = recipes_df.loc[:, "Name"].map(lambda x: html.unescape(x))
recipes_df.loc[:, "Description"] = recipes_df.loc[:, "Description"].map(lambda x: html.unescape(x))


In [12]:
float_cols = recipes_df.dtypes[recipes_df.dtypes != "int64"][
    recipes_df.dtypes == "float64"
].index
float_cols


Index(['AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings'],
      dtype='object')

In [13]:
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
stopword_set = set(stopwords.words("english"))
wnl = WordNetLemmatizer()
punc_table = str.maketrans(string.punctuation, " " * len(string.punctuation))

def preprocess(x):
    text: str = html.unescape(x).lower()
    text = text.translate(punc_table)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = ' '.join([wnl.lemmatize(term) for term in word_tokenize(text)])
    text = ' '.join([b for b in word_tokenize(text) if (b in nouns and not b in stopword_set)])
    return text.strip()


In [65]:
texts_df = recipes_df[["Name", "Description"]].join(recipes_df["RecipeInstructions"].transform(" ".join).to_frame("RecipeInstructions")).fillna("").agg(" ".join, axis=1)
texts_df.info()


<class 'pandas.core.series.Series'>
RangeIndex: 522517 entries, 0 to 522516
Series name: None
Non-Null Count   Dtype 
--------------   ----- 
522517 non-null  object
dtypes: object(1)
memory usage: 4.0+ MB


In [67]:
recipes_df["all_texts"] = texts_df


In [16]:
vectorizer = TfidfVectorizer(preprocessor=preprocess)
vectorizer.fit(texts_df)
joblib.dump(vectorizer, "tfidf.joblib")


['tfidf.joblib']

In [18]:
X = vectorizer.transform(texts_df)
sparse.save_npz("~/resources/food/tfidf.npz", X)
X.data.nbytes


<522517x15127 sparse matrix of type '<class 'numpy.float64'>'
	with 22275799 stored elements in Compressed Sparse Row format>

## Search

In [40]:
query = vectorizer.transform(["chicken pie"])
recipes_df.join(pd.Series(cosine_similarity(query, X).reshape(-1)).to_frame("score")).sort_values("score", ascending=False).head(5)


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,score
411907,427008.0,Chunky Chicken Pot Pie,1620823,Eribrysan,PT30M,PT20M,PT50M,2010-05-24 14:42:00+00:00,Make and share this Chunky Chicken Pot Pie rec...,[],...,57.4,806.1,45.7,2.7,5.1,7.6,,,"[Preheat oven to 400 degrees., Sear and cube c...",0.838781
32353,35996.0,Easy Chicken Pot Pie,47649,Indiana Nurse,PT30M,PT12M,PT42M,2002-08-05 19:24:00+00:00,Make and share this Easy Chicken Pot Pie recip...,[],...,334.4,3144.1,148.5,13.1,20.2,69.4,,1 pie,"[Preheat oven to 400 degrees., In a pie pan, m...",0.806799
383537,397339.0,Chicken Pot Pie,828849,mcmonkhouse,PT1H,PT0S,PT1H,2009-11-02 10:36:00+00:00,Make and share this Chicken Pot Pie recipe fro...,[],...,300.9,589.3,81.0,5.9,4.2,45.0,6.0,6 pies,[Mix.],0.802883
62386,66810.0,Chicken Pot Pie,93095,Shellbelle,PT30M,PT5M,PT35M,2003-07-15 20:06:00+00:00,Make and share this Chicken Pot Pie recipe fro...,[],...,6.1,1002.8,46.5,3.3,0.6,7.3,4.0,,"[Mix soup, Veg-All, and chicken together, add ...",0.801281
519971,538667.0,Chicken Fajita Pie,2002459195,Scottie T.,PT30M,PT15M,PT45M,2019-04-27 12:59:00+00:00,Make and share this Chicken Fajita Pie recipe ...,[],...,100.3,766.0,24.9,4.3,3.4,27.4,8.0,8 slices,"[Preheat oven to 350., Cook chicken according ...",0.77633


In [15]:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(
    "https://127.0.0.1:9200",
    basic_auth=(
        "elastic",
        "yHcm1Pyq=jnDL_4gw93i",
    ),
    ca_certs="http_ca.crt"
)
es.info()


ObjectApiResponse({'name': '9429c74bab12', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'PzQzy0-rTiKgmDWiUpey6A', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [68]:
def generate_indices():
    for index, row in recipes_df.iterrows():
        yield {"_index": "recipes", "_id": row.RecipeId, **row.to_dict()}

es.indices.delete(index="recipes")
helpers.bulk(es, generate_indices())


(522517, [])

In [None]:
query = {
    "function_score": {
        "query": {"fuzzy": {"Name": {"value": "sandwich"}}},
        "functions": [
            {
                "script_score": {
                    "script": {"source": "doc['AggregatedRating'].value"},
                },
                "weight": 0.1,
            },
            {
                "script_score": {
                    "script": {"source": "doc['ReviewCount'].value"},
                },
                "weight": 0.05,
            },
            {
                "script_score": {
                    "script": {"source": "_score"},
                },
                "weight": 1,
            },
        ],
        "score_mode": "max",
    }
}
results = es.search(index="recipes", query=query)

pd.DataFrame(
    [[hit["_score"], *hit["_source"].values()] for hit in results["hits"]["hits"]],
    columns=["score"] + list(results["hits"]["hits"][0]["_source"].keys()),
)


In [55]:
recipes_df[recipes_df["AggregatedRating"] > 0]["AggregatedRating"].mean()


4.632013709922984

In [41]:
recommend_query = {
    "function_score": {
        "query": {"match_all": {}},
        "functions": [
            {
                "script_score": {
                    "script": {
                        "source": "(doc['AggregatedRating'].value * doc['ReviewCount'].value + 4.632013709922984 * 100) / (doc['AggregatedRating'].value + 100)"
                    },
                },
                "weight": 0.2,
            },
            {
                "script_score": {
                    "script": {"source": "_score"},
                },
                "weight": 1,
            },
        ],
        "score_mode": "multiply",
    }
}
results = es.search(index="recipes", query=query)

pd.DataFrame(
    [[hit["_score"], *hit["_source"].values()] for hit in results["hits"]["hits"]],
    columns=["score"] + list(results["hits"]["hits"][0]["_source"].keys()),
).head(5)


Unnamed: 0,score,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,29.62614,45809,Bourbon Chicken,58278,LinMarie,PT20M,PT15M,PT35M,2002-11-12T20:13:00+00:00,I searched and finally found this recipe on th...,...,6.5,145.3,1573.0,23.4,0.3,21.5,50.1,4.0,,[Editor's Note: Named Bourbon Chicken because...
1,22.102331,2886,Best Banana Bread,1762,lkadlec,PT1H,PT10M,PT1H10M,1999-09-26T20:49:00+00:00,Make and share this Best Banana Bread recipe f...,...,6.2,61.6,338.3,42.5,1.4,24.4,3.7,10.0,1 loaf,"[Remove odd pots and pans from oven., Preheat ..."
2,16.568998,27208,To Die for Crock Pot Roast,28201,yooper,PT9H,PT5M,PT9H5M,2002-05-03T15:11:00+00:00,"Amazing flavor, and so simple! No salt needed ...",...,3.9,149.8,380.6,2.6,0.1,0.0,49.5,8.0,,"[Place beef roast in crock pot., Mix the dried..."
3,15.559475,39087,Creamy Cajun Chicken Pasta,30534,Lorac,PT15M,PT10M,PT25M,2002-09-02T19:26:00+00:00,Make and share this Creamy Cajun Chicken Pasta...,...,24.0,198.6,816.8,47.4,2.5,3.1,39.3,2.0,,[Place chicken and Cajun seasoning in a bowl a...
4,14.727701,89204,Crock-Pot Chicken With Black Beans &amp; Cream...,137839,Jen Santiago,PT4H,PT3M,PT4H3M,2004-04-16T20:00:00+00:00,I love this Crock-Pot chicken recipe for two r...,...,15.2,155.3,913.4,50.1,11.2,9.9,45.9,4.0,,"[Take 4-5 frozen, yes, frozen, boneless chicke..."


In [72]:
result = es.get(index="recipes", id=45809)
results_df = pd.DataFrame(
    [[*result["_source"].values()]],
    columns=list(result["_source"].keys()),
)
results_df


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,all_texts
0,45809,Bourbon Chicken,58278,LinMarie,PT20M,PT15M,PT35M,2002-11-12T20:13:00+00:00,I searched and finally found this recipe on th...,[https://img.sndimg.com/food/image/upload/w_55...,...,145.3,1573.0,23.4,0.3,21.5,50.1,4.0,,[Editor's Note: Named Bourbon Chicken because...,Bourbon Chicken I searched and finally found t...


In [129]:
text = "parmasasn spaghett"
suggest_dictionary = {
    "text": text,
    "suggest-1": {"term": {"field": "all_texts"}},
    "suggest-2": {"term": {"field": "Name"}},
    "suggest-3": {"term": {"field": "Description"}},
    "suggest-4": {"term": {"field": "RecipeInstructions"}},

}

query_dictionary = {"suggest": suggest_dictionary}
res = es.search(
    index='recipes',
    body=query_dictionary)

p = []
for term in np.array(list(res["suggest"].values())).T:
    result = {}
    result["text"] = term[0]["text"]
    options = [v["options"] for v in term]
    result["candidates"] = {}
    # df = pd.DataFrame([v["options"] for v in term])
    for option in options:
        candidates = {}
        if len(option) > 0:
            candidates["text"] = option[0]["text"]
            for candidate in option:
                # print(candidate)
                if candidate["text"] not in result["candidates"]:
                    result["candidates"][candidate["text"]] = {"score": candidate["score"], "freq": candidate["freq"]}
                else:
                    result["candidates"][candidate["text"]]["score"] = (result["candidates"][candidate["text"]]["score"] * result["candidates"][candidate["text"]]["freq"] + candidate["score"] * candidate["freq"]) / (result["candidates"][candidate["text"]]["freq"] + candidate["freq"])
                    result["candidates"][candidate["text"]]["freq"] = result["candidates"][candidate["text"]]["freq"] + candidate["freq"]
    p += [result["candidates"]]
out = [""] * len(text.split())
for i, pp in enumerate(p):
    if pp:
        df = pd.DataFrame.from_dict(pp,orient="index")
        R = (df["score"] * df["freq"]).sum() / df["freq"].sum()
        W = df["freq"].mean()
        df["bayes_score"] = (df["score"] * df["freq"] + W * R) / (df["freq"] + W)
        out[i] = df.sort_values("bayes_score", ascending=False).head(1).index[0]
    else:
        out[i] = text.split()[i]
out


['parmasan', 'spaghetti']