In [2]:
import ast
import os
import time
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from Levenshtein import distance
from scipy.special import expit


class OptimizedSearchEngine:
    """Recipe search engine that ranks documents with Word2Vec similarities.

    A query is first spell-corrected against the model vocabulary. Every recipe
    is then scored by the log of the average, over query words, of the sigmoid
    of the best cosine similarity between that query word and any word of the
    recipe's ``combined_cleaned`` token list.
    """

    def __init__(self, model_path: Optional[str] = None, recipes_path: Optional[str] = None):
        """Create the engine, loading the model and recipes when the paths exist.

        Args:
            model_path: Path of a saved gensim Word2Vec model. Skipped when it
                is ``None`` or does not exist on disk.
            recipes_path: Path of the recipes CSV. Skipped when it is ``None``
                or does not exist on disk.
        """
        self.model = None
        self.recipes = None
        self.vocabulary = None

        # Missing paths are skipped silently; `search` reports the problem later.
        if model_path and os.path.exists(model_path):
            self.load_model(model_path)
        if recipes_path and os.path.exists(recipes_path):
            self.load_recipes(recipes_path)

    def load_model(self, model_path: str):
        """Load a pre-trained Word2Vec model and cache its vocabulary as a set."""
        # NOTE: gensim model files are pickle-based; only load files you trust.
        self.model = Word2Vec.load(model_path)
        self.vocabulary = set(self.model.wv.index_to_key)

    @staticmethod
    def _parse_tokens(value) -> List[str]:
        """Turn one ``combined_cleaned`` cell into a list of tokens."""
        if isinstance(value, (list, tuple)):
            return list(value)
        if not isinstance(value, str):
            # Missing cells (NaN) become an empty document.
            return []
        try:
            # literal_eval only accepts Python literals, so a crafted CSV cell
            # cannot execute code the way plain `eval` would.
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return value.split()
        if isinstance(parsed, (list, tuple)):
            return [str(token) for token in parsed]
        # A literal that is not a sequence (e.g. "123"): treat the cell as text.
        return value.split()

    def load_recipes(self, recipes_path: str):
        """Load recipes from CSV and normalise ``combined_cleaned`` to token lists.

        Each cell is parsed on its own: a stringified Python list is decoded
        safely, any other string is split on whitespace, and a missing value
        becomes an empty list.

        Raises:
            KeyError: If the CSV has no ``combined_cleaned`` column.
        """
        self.recipes = pd.read_csv(recipes_path)
        self.recipes['combined_cleaned'] = self.recipes['combined_cleaned'].apply(self._parse_tokens)

    def _get_document_vector(self, doc: List[str]):
        """Return the mean vector of the in-vocabulary words of ``doc``.

        A zero vector of the model's dimensionality is returned when no word of
        ``doc`` is in the vocabulary.
        """
        vectors = [self.model.wv[word] for word in doc if word in self.vocabulary]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(self.model.vector_size)

    def correct_word(self, word: str, max_distance: int = 2) -> str:
        """Correct a typo by snapping ``word`` to the closest vocabulary entry.

        Args:
            word: The word to check.
            max_distance: Largest Levenshtein distance accepted as a correction.

        Returns:
            ``word`` itself when it is in the vocabulary, when the vocabulary is
            empty/unset, or when no entry is within ``max_distance`` edits;
            otherwise the closest vocabulary entry.
        """
        if not self.vocabulary or word in self.vocabulary:
            return word
        # Comparing (distance, candidate) pairs computes each distance once and
        # breaks ties alphabetically rather than by arbitrary set order.
        best_distance, closest_word = min(
            (distance(word, candidate), candidate) for candidate in self.vocabulary
        )
        return closest_word if best_distance <= max_distance else word

    def preprocess_query(self, query: str) -> str:
        """Split the query on whitespace and spell-correct every word."""
        # correct_word returns in-vocabulary words unchanged via a set lookup.
        return ' '.join(self.correct_word(word) for word in query.split())

    def compute_avg_log_likelihood(self, query, doc, epsilon=1e-10):
        """Score one document against a query.

        Args:
            query: Whitespace-separated query string.
            doc: Whitespace-separated document string.
            epsilon: Lower bound applied to the average before taking the log.

        Returns:
            ``log(mean_q sigmoid(max_d similarity(q, d)))`` over the query and
            document words known to the model, or ``log(epsilon)`` when either
            side has no known word.
        """
        wv = self.model.wv
        # Words unknown to the model are dropped: `similarity` would raise
        # KeyError on them.
        query_words = [word for word in query.split() if word in wv]
        doc_words = [word for word in doc.split() if word in wv]
        if not query_words or not doc_words:
            # A finite floor instead of NaN, which would sort to the top of a
            # descending argsort.
            return np.log(epsilon)

        best_matches = [
            expit(max(wv.similarity(query_word, doc_word) for doc_word in doc_words))
            for query_word in query_words
        ]
        avg_similarity = max(np.mean(best_matches), epsilon)
        return np.log(avg_similarity)

    def execute_search_Word2Vec(self, query):
        """Return one relevance score per recipe, in row order."""
        documents = self.recipes['combined_cleaned']
        relevances = np.zeros(len(documents))
        # enumerate yields row positions, so this works for any DataFrame index.
        for position, tokens in enumerate(documents):
            relevances[position] = self.compute_avg_log_likelihood(query, ' '.join(tokens))
        return relevances

    def search(self, query: str, top_k: int = 5, verbose: bool = False) -> np.ndarray:
        """Execute a search and return the positions of the top ``top_k`` recipes.

        Args:
            query: Free-text query; each word is spell-corrected first.
            top_k: Maximum number of results to return.
            verbose: When true, print the corrected query and the elapsed time.

        Returns:
            Array of row positions into ``self.recipes`` (usable with ``.iloc``),
            ordered from most to least relevant.

        Raises:
            RuntimeError: If the model or the recipes have not been loaded.
        """
        if self.model is None or self.recipes is None:
            raise RuntimeError("Load both a model and recipes before calling search().")

        start_time = time.time()
        query = self.preprocess_query(query)
        relevance_scores = self.execute_search_Word2Vec(query)

        # Descending order of relevance, truncated to the requested count.
        sorted_indices = np.argsort(relevance_scores)[::-1][:max(top_k, 0)]

        if verbose:
            print(f"Search for {query!r} completed in {time.time() - start_time:.2f} seconds")
        return sorted_indices

    def save_model(self, model_path: str):
        """Save the model to ``model_path``; does nothing when no model is loaded."""
        if self.model:
            self.model.save(model_path)

In [8]:
# Preview the cleaned recipe CSV; this is the same file later passed to the engine as
# `recipes_path`. (pandas is already imported in the first cell, so this re-import is
# redundant but harmless.)
import pandas as pd
pd.read_csv("cleaned_recipe_data.csv")

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,index,combined_cleaned,ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...",0,"['misobutter', 'roast', 'chicken', 'acorn', 's...","['chicken', 'kosher salt', 'acorn squash', 'sa..."
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...",1,"['crispy', 'pepper', 'potatoespreheat', 'line'...","['egg whites', 'potatoes', 'kosher salt', 'gro..."
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...",2,"['thanksgiving', 'mac', 'cheeseplace', 'rack',...","['evaporated milk', 'whole milk', 'garlic powd..."
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in...",3,"['italian', 'sausage', 'bread', 'stuffingprehe...","['Italian loaf', 'olive oil', 'Italian sausage..."
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",4,"['newtons', 'lawstir', 'together', 'hot', 'coc...","['dark brown sugar', 'hot water', 'bourbon', '..."
...,...,...,...,...,...,...,...
13488,Brownie Pudding Cake,Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408,"['1 cup all-purpose flour', '2/3 cup unsweeten...",13496,"['brownie', 'pudding', 'cakepreheat', 'sift', ...","['all-purpose flour', 'cocoa powder', 'baking ..."
13489,Israeli Couscous with Roasted Butternut Squash...,Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",13497,"['israeli', 'couscous', 'roasted', 'butternut'...","['lemon', 'butternut squash', 'dice', 'olive o..."
13490,Rice with Soy-Glazed Bonito Flakes and Sesame ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,['Leftover katsuo bushi (dried bonito flakes) ...,13498,"['rice', 'soyglazed', 'bonito', 'flakes', 'ses...","['katsuo bushi', 'bonito flakes', 'dashi', 'ka..."
13491,Spanakopita,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,13499,"['spanakopitamelt', 'moderate', 'spinach', 'st...","['alted butter', 'baby spinach', 'feta', 'nutm..."


In [13]:
# Build the shared engine instance from the saved Word2Vec model and the recipe CSV.
# NOTE(review): the constructor skips any path that does not exist on disk, so a
# missing file leaves the engine without a model/recipes instead of failing here.
SEARCH_ENGINE = OptimizedSearchEngine(
    model_path="word2vec_model",
    recipes_path="cleaned_recipe_data.csv"
)


In [14]:
from search_engine import OptimizedSearchEngine  # if saved in a file like search_engine.py


In [15]:
# Step 4.1: Reuse the engine instance built in the earlier cell
engine = SEARCH_ENGINE

# Step 4.2: Try searching; the call returns the row positions of the top_k best-scoring recipes
engine.search("chicken soup", top_k=3)  # NOTE: this query is spelled correctly, so typo correction is not exercised here; use a misspelling (e.g. "chiken") to test it


array([13492,  5687,  5684])