In [None]:
# Install the necessary libraries
!pip install numpy datasets scikit-learn model2vec
    
# Import the necessary libraries
import regex
from collections import Counter

import numpy as np
from datasets import load_dataset, Dataset
from sklearn.metrics.pairwise import cosine_similarity

from model2vec import StaticModel
from model2vec.distill import distill

In [5]:
# Load the "train" split of the "Shengtao/recipe" dataset
dataset = load_dataset("Shengtao/recipe", split="train")
# Show the column names so the available fields are visible in the cell output
print(dataset.column_names)
# Take the title column as our recipes corpus: the titles are what the
# similarity searches below rank, and what the custom vocabulary is built from
recipes = dataset["title"]

['title', 'url', 'category', 'author', 'description', 'rating', 'rating_count', 'review_count', 'ingredients', 'directions', 'prep_time', 'cook_time', 'total_time', 'servings', 'yields', 'calories', 'carbohydrates_g', 'sugars_g', 'fat_g', 'saturated_fat_g', 'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg', 'calories_from_fat', 'calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg', 'zinc_mg', 'phosphorus_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg', 'vitamin_b6_mg', 'vitamin_c_mg', 'folate_mcg', 'thiamin_mg', 'riboflavin_mg', 'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg', 'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g', 'trans_fatty_acid_g', 'omega_3_fatty_acid_g', 'omega_6_fatty_acid_g', 'instructions_list', 'image']


In [8]:
# Define a function to find the most similar titles in a dataset to a given query
def find_most_similar_items(model: StaticModel, dataset: list[str], query: str, top_k: int = 5) -> list[tuple[str, float]]:
    """
    Finds the most similar items in a dataset to the given query using the specified model.

    The whole dataset is encoded again on every call, so the cost of one call
    grows with the size of the dataset.

    :param model: The model used to generate embeddings (its ``encode`` method is called
        once with the dataset and once with the query).
    :param dataset: The dataset of recipe titles.
    :param query: The query recipe title.
    :param top_k: The number of most similar titles to return. Values <= 0 return an empty list.
    :return: A list of at most ``top_k`` ``(title, cosine similarity)`` tuples, ordered from
        most to least similar. Scores are plain Python floats.
    """
    # Nothing to rank: return early instead of passing an empty matrix to
    # cosine_similarity or letting a negative top_k slice from the wrong end
    if top_k <= 0 or len(dataset) == 0:
        return []

    # Generate embeddings for the entire dataset
    embeddings = model.encode(dataset)

    # Generate the query embedding as a single-row 2D array, as cosine_similarity expects
    query_embedding = model.encode(query).reshape(1, -1)

    # Cosine similarity of the query against every dataset item (row 0 = the only query)
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Indices of the top-k items, sorted by descending similarity
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Convert numpy scalars to built-in int/float so the result matches the annotation
    return [(dataset[int(i)], float(similarities[i])) for i in top_indices]

In [None]:
# Load the M2V output model from the HuggingFace hub
# (model_name is reassigned by later cells; model_output keeps a handle on this model)
model_name = "minishlab/M2V_base_output"
model_output = StaticModel.from_pretrained(model_name)

In [19]:
# Find recipes using the output embeddings model
top_k = 5

# Run every query through the same search-and-print steps instead of repeating them
queries = ["cheeseburger", "fattoush"]
for query_number, query in enumerate(queries):
    if query_number > 0:
        print()  # Blank line between the result lists of consecutive queries
    results = find_most_similar_items(model_output, recipes, query, top_k)
    print(f"Most similar recipes to '{query}':")
    for title, score in results:
        print(f"Title: {title}, Similarity Score: {score:.4f}")
    

Most similar recipes to 'cheeseburger':
Title: Double Cheeseburger, Similarity Score: 0.9028
Title: Cheeseburger Chowder, Similarity Score: 0.8574
Title: Cheeseburger Sliders, Similarity Score: 0.8413
Title: Cheeseburger Salad, Similarity Score: 0.8384
Title: Cheeseburger Soup I, Similarity Score: 0.8298

Most similar recipes to 'fattoush':
Title: Fattoush, Similarity Score: 1.0000
Title: Lebanese Fattoush, Similarity Score: 0.8370
Title: Aunty Terese's Fattoush, Similarity Score: 0.7630
Title: Arabic Fattoush Salad, Similarity Score: 0.7588
Title: Authentic Lebanese Fattoush, Similarity Score: 0.7584


In [None]:
# Load the M2V glove model from the HuggingFace hub
# (model_name is reassigned by other cells; model_glove keeps a handle on this model)
model_name = "minishlab/M2V_base_glove"
model_glove = StaticModel.from_pretrained(model_name)

In [20]:
# Find recipes using the GloVe model
top_k = 5

# Run every query through the same search-and-print steps instead of repeating them.
# NOTE: 'fattoush' is Out-Of-Vocabulary (OOV) for the GloVe model and will return a zero vector,
# so all of its similarity scores are 0 and the titles listed for it are not meaningful.
queries = ["cheeseburger", "fattoush"]
for query_number, query in enumerate(queries):
    if query_number > 0:
        print()  # Blank line between the result lists of consecutive queries
    results = find_most_similar_items(model_glove, recipes, query, top_k)
    print(f"Most similar recipes to '{query}':")
    for title, score in results:
        print(f"Title: {title}, Similarity Score: {score:.4f}")
    

Most similar recipes to 'cheeseburger':
Title: Double Cheeseburger, Similarity Score: 0.8744
Title: Cheeseburger Meatloaf, Similarity Score: 0.8246
Title: Cheeseburger Salad, Similarity Score: 0.8160
Title: Hearty American Cheeseburger, Similarity Score: 0.8006
Title: Cheeseburger Chowder, Similarity Score: 0.7989

Most similar recipes to 'fattoush':
Title: Zucchini and Onion Pancake, Similarity Score: 0.0000
Title: Crab Ball, Similarity Score: 0.0000
Title: Shrimp Eggs Foo Yung, Similarity Score: 0.0000
Title: Thai Shrimp and Snow Peas, Similarity Score: 0.0000
Title: Charbroiled Salmon, Similarity Score: 0.0000


In [23]:
# Regex tokenizer: a token is either a run of word characters (\w+)
# or a run of characters that are neither word characters nor whitespace ([^\w\s]+)
my_regex = regex.compile(r"\w+|[^\w\s]+")

# Function to tokenize a list of texts into one flat list of tokens
def tokenize_texts(texts: list[str]) -> list[str]:
    """
    Lowercases each text and splits it with the module-level regex tokenizer.

    :param texts: The texts to tokenize.
    :return: The tokens of all texts, concatenated in input order into a single list.
    """
    return [
        token
        for text in texts
        for token in my_regex.findall(text.lower())
    ]

In [24]:
# Tokenize the recipe titles
tokens = tokenize_texts(recipes)

# Count the frequency of each token; Counter consumes the whole list in one pass,
# so no manual batching is needed
token_counts = Counter(tokens)

# Define vocabulary size and get the most common tokens.
# most_common returns at most vocab_size entries, so the vocabulary is smaller
# when the titles contain fewer distinct tokens than that.
vocab_size = 30000
vocab = [word for word, _count in token_counts.most_common(vocab_size)]

In [26]:
# Choose a Sentence Transformer model to distill from
model_name = "BAAI/bge-small-en-v1.5"

# Distill a model2vec model using the Sentence Transformer model and the custom vocab.
# vocab holds the most frequent tokens of the recipe titles, computed in the cell above.
# NOTE(review): pca_dims=256 presumably sets the dimensionality of the distilled
# embeddings after PCA - confirm against the model2vec distill documentation.
model_custom = distill(model_name=model_name, vocabulary=vocab, pca_dims=256)

100%|██████████| 8/8 [00:08<00:00,  1.04s/it]


In [27]:
# Find recipes using the custom distilled model
top_k = 5

# Run every query through the same search-and-print steps instead of repeating them.
# 'fattoush' is now in the custom vocabulary and will return related recipes.
queries = ["cheeseburger", "fattoush"]
for query_number, query in enumerate(queries):
    if query_number > 0:
        print()  # Blank line between the result lists of consecutive queries
    results = find_most_similar_items(model_custom, recipes, query, top_k)
    print(f"Most similar recipes to '{query}':")
    for title, score in results:
        print(f"Title: {title}, Similarity Score: {score:.4f}")

Most similar recipes to 'cheeseburger':
Title: Cheeseburger Salad, Similarity Score: 0.9528
Title: Cheeseburger Casserole, Similarity Score: 0.9030
Title: Cheeseburger Chowder, Similarity Score: 0.8635
Title: Cheeseburger Pie, Similarity Score: 0.8401
Title: Cheeseburger Meatloaf, Similarity Score: 0.8184

Most similar recipes to 'fattoush':
Title: Fattoush, Similarity Score: 1.0000
Title: Fatoosh, Similarity Score: 0.7488
Title: Lebanese Fattoush, Similarity Score: 0.6344
Title: Arabic Fattoush Salad, Similarity Score: 0.6108
Title: Fattoush (Lebanese Salad), Similarity Score: 0.5669
