In [98]:
import pandas as pd
from json import loads

In [99]:
# Read the parsed recipes, then decode the columns that were stored as JSON text.
df = pd.read_csv('../parsers/output/parsed_recipes.csv')

for json_column in ('tags', 'ingredients'):
    df[json_column] = df[json_column].apply(loads)

In [100]:
# Show the column labels of the loaded recipes frame.
df.columns

Index(['name', 'author_name', 'cooking_time', 'preparation_time', 'created_at',
       'description', 'images', 'RecipeCategory', 'tags', 'calories',
       'total_fat', 'saturated_fat', 'cholesterol', 'sodium', 'carbohydrates',
       'fiber', 'sugar', 'protein', 'servings', 'steps', 'ingredients',
       'serving_size'],
      dtype='object')

In [101]:
# Count the ingredient entries that carry a 'quantity' key.
# The boolean mask is summed so only True values are counted; taking len() of
# the mask would return the number of exploded entries regardless of the test.
# dropna() skips the NaN placeholders explode() emits for empty ingredient
# lists, which would otherwise raise TypeError in the `in` check.
int(df['ingredients'].explode().dropna().map(lambda el: 'quantity' in el).sum())

4629646

In [102]:
# Occurrence count of every ingredient name across all recipes, rarest first.
(
    df['ingredients']
    .explode()
    .map(lambda ingredient: ingredient['name'])
    .value_counts()
    .sort_values()
)

ingredients
ground sweet Italian sausage (remove from casing if using links)         1
quick-cooking oatmeal (3 soup spoons)                                    1
other berries or 2 cups combination fruit                                1
tamari or 6 tablespoons soy sauce                                        1
bananas (a little under ripe holds up best)                              1
                                                                     ...  
water                                                                68866
eggs                                                                 73932
butter                                                               97135
sugar                                                               102520
salt                                                                181736
Name: count, Length: 424781, dtype: int64

In [103]:
# Occurrence count per ingredient name, in ascending order of frequency.
x = (
    df['ingredients']
    .explode()
    .map(lambda ingredient: ingredient['name'])
    .value_counts()
    .sort_values()
)

# Retain only the names that occur more than 9 times.
x = x.loc[x.gt(9)]

In [104]:
# Keep only the ingredient names (the index of the filtered counts) as a
# positionally indexed Series; the counts themselves are discarded.
# NOTE: not idempotent — `x` is rebound to a different kind of object, so
# running this cell a second time would replace the names with the integer
# positions of the Series produced by the first run.
x = pd.Series(x.index)

In [105]:
# Display the retained ingredient names.
x

0                          vegetarian sausage
1                       oil or 1/4 cup butter
2                   spring mixed salad greens
3                   tablespoons minced garlic
4        chicken stock or 1 cup chicken broth
                         ...                 
13673                                   water
13674                                    eggs
13675                                  butter
13676                                   sugar
13677                                    salt
Name: ingredients, Length: 13678, dtype: object

In [111]:
from sentence_transformers import SentenceTransformer

# Instantiate a SentenceTransformer from the 'all-MiniLM-L6-v2' identifier;
# this instance is what the cells below use to embed ingredient names.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [112]:
import torch
from torch.nn.functional import cosine_similarity

# Embed every candidate ingredient name once, as a single tensor whose row
# order follows the order of `x`.
# A plain list is passed rather than the Series itself: encode() is documented
# for a string or a list of strings, and indexing a Series with integers is
# label-based, which only matches positions while the index is 0..n-1.
string_embeddings = model.encode(x.tolist(), convert_to_tensor=True)

def find_most_similar_string(new_string: str):
    """Return the known ingredient name closest to ``new_string``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the module-level ``string_embeddings``, whose
    rows are expected to be in the same order as the entries of ``x``.

    Args:
        new_string: Free-text ingredient name to match.

    Returns:
        A ``(most_similar_string, similarity_score)`` tuple: the best-matching
        entry of ``x`` and its cosine similarity to the query as a Python float.
    """
    query_embedding = model.encode(new_string, convert_to_tensor=True)

    # The single query row is broadcast against every candidate row, giving
    # one similarity score per candidate.
    similarities = cosine_similarity(query_embedding.unsqueeze(0), string_embeddings)

    most_similar_index = torch.argmax(similarities).item()
    # argmax yields a position, so look the name up positionally; plain
    # `x[...]` is a label lookup and only agrees while the index is 0..n-1.
    most_similar_string = x.iloc[most_similar_index]
    similarity_score = similarities[most_similar_index].item()

    return most_similar_string, similarity_score

# Example lookup: match a noisy ingredient string against the known names.
# NOTE(review): the recorded output beneath this cell also contains a Series
# dump and a bare integer ahead of the message; no statement in this cell or in
# find_most_similar_string as shown prints them, so the output presumably
# predates the current code — re-run to refresh it.
new_string = 'finely chopped parsley (or to taste'

most_similar, score = find_most_similar_string(new_string)
print(f"The most similar string to '{new_string}' is '{most_similar}' with a similarity score of {score:.4f}.")

0                          vegetarian sausage
1                       oil or 1/4 cup butter
2                   spring mixed salad greens
3                   tablespoons minced garlic
4        chicken stock or 1 cup chicken broth
                         ...                 
13673                                   water
13674                                    eggs
13675                                  butter
13676                                   sugar
13677                                    salt
Name: ingredients, Length: 13678, dtype: object
12618
The most similar string to 'finely chopped parsley (or to taste' is 'finely chopped parsley' with a similarity score of 0.9457.


In [114]:
# Second example lookup, using an ingredient string with a parenthetical suffix.
# NOTE(review): the recorded output beneath this cell also contains a Series
# dump and a bare integer ahead of the message; no statement in this cell or in
# find_most_similar_string as shown prints them, so the output presumably
# predates the current code — re-run to refresh it.
new_string = 'heavy cream (or less)'

most_similar, score = find_most_similar_string(new_string)
print(f"The most similar string to '{new_string}' is '{most_similar}' with a similarity score of {score:.4f}.")

0                          vegetarian sausage
1                       oil or 1/4 cup butter
2                   spring mixed salad greens
3                   tablespoons minced garlic
4        chicken stock or 1 cup chicken broth
                         ...                 
13673                                   water
13674                                    eggs
13675                                  butter
13676                                   sugar
13677                                    salt
Name: ingredients, Length: 13678, dtype: object
13628
The most similar string to 'heavy cream (or less)' is 'heavy cream' with a similarity score of 0.9406.
