In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import pickle

# Read the recipe table that both the feature matrices and the final
# recommendations are drawn from.
file = 'recipe_final (1).csv'
recipes = pd.read_csv(file)

# Multiplier applied to the TF-IDF block so that ingredient similarity carries
# far more weight than the nutrition columns in the distance computation.
# Raise it to focus even more strongly on ingredients.
ingredient_weight = 100.0

# Text features: TF-IDF over the ingredients_list column, scaled by the weight.
# The product of a sparse matrix and a scalar stays sparse.
vectorizerElement = TfidfVectorizer()
X_ing = vectorizerElement.fit_transform(recipes['ingredients_list'])
X_ing_weighted = X_ing * ingredient_weight

# Numeric features: standardize the seven nutrition columns. The column order
# here is the order predict() expects for the first seven input values.
scalerElement = StandardScaler()
X_num = scalerElement.fit_transform(
    recipes[
        [
            'calories',
            'fat',
            'carbohydrates',
            'protein',
            'cholesterol',
            'sodium',
            'fiber',
        ]
    ]
)

# Wrap the dense numeric block as sparse so it can be stacked with the TF-IDF
# block without densifying the (much wider) text features.
X_num_sparse = csr_matrix(X_num)

# Final design matrix: [scaled nutrition | weighted TF-IDF], column-wise.
X_com = hstack([X_num_sparse, X_ing_weighted])

# Brute-force search is used because it accepts sparse input; fit() returns
# the estimator itself, so KNN is the fitted model.
KNN = NearestNeighbors(
    n_neighbors=3,
    metric='euclidean',
    algorithm='brute',
).fit(X_com)

def predict(input):
    """
    Predict similar recipes based on a combined feature vector.

    Parameters:
    - input: sequence with at least 8 elements. The first 7 are the numerical
             features, in the order the scaler was fitted on:
             [calories, fat, carbohydrates, protein, cholesterol, sodium, fiber]
             and the 8th element is the ingredients string. Elements past the
             8th are ignored. (The parameter name shadows the builtin but is
             kept so existing callers keep working.)

    Returns:
    - A pandas DataFrame with the recommended recipes
      (columns: recipe_name, ingredients_list, image_url).

    Raises:
    - ValueError: if input has fewer than 8 elements.
    """
    # Fail early with a clear message instead of an opaque IndexError or a
    # feature-count error raised from inside the scaler.
    if len(input) < 8:
        raise ValueError(
            "input must contain 7 numerical features followed by an "
            "ingredients string; got %d element(s)" % len(input)
        )

    # The scaler was fitted on a DataFrame, so it remembers column names.
    # Passing a bare list makes scikit-learn warn on every call; rebuild a
    # one-row frame with the same names when they are available.
    numeric_row = [input[:7]]
    feature_names = getattr(scalerElement, 'feature_names_in_', None)
    if feature_names is not None:
        numeric_row = pd.DataFrame(numeric_row, columns=feature_names)

    # Scale the numerical portion and convert to sparse format
    scaledinput = scalerElement.transform(numeric_row)
    scaledinput_sparse = csr_matrix(scaledinput)

    # Transform the ingredient text and apply the same weight used at fit time
    input_ing_trans = vectorizerElement.transform([input[7]]) * ingredient_weight

    # Combine numerical and textual input features in the training column order
    cominputs = hstack([scaledinput_sparse, input_ing_trans])

    # Only the neighbor row positions are needed, so skip the distances
    indexes = KNN.kneighbors(cominputs, return_distance=False)
    recoms = recipes.iloc[indexes[0]]
    return recoms[['recipe_name', 'ingredients_list', 'image_url']]

# Smoke-test the recommender with one sample query:
# seven nutrition values followed by the ingredients text.
inputf = [28, 39, 1, 42, 24, 89, 2, 'egg, bread']
a = predict(inputf)
print(a)

# Persist each fitted object to its own pickle file, in this order:
# the neighbor model, the numeric scaler, then the TF-IDF vectorizer.
for artifact_path, fitted_object in (
    ('modelRecipe.pkl', KNN),
    ('scalerElement.pkl', scalerElement),
    ('vectorizerElement.pkl', vectorizerElement),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(fitted_object, f)


                       recipe_name  \
33764                Egg in a Boat   
31834               Sunshine Toast   
42950  Campfire Breakfast Sandwich   

                                   ingredients_list  \
33764              ['butter', 'white bread', 'egg']   
31834   ['butter', 'bread', 'egg', 'salt to taste']   
42950  ['butter', 'bread', 'egg', 'Cheddar cheese']   

                                               image_url  
33764  https://images.media-allrecipes.com/userphotos...  
31834  https://images.media-allrecipes.com/userphotos...  
42950  http://images.media-allrecipes.com/userphotos/...  


