In [2]:
import ast
import html
import re

import numpy as np
import pandas as pd
import spacy

In [3]:
# Read the recipe table from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="ingredients_list.csv")

In [None]:
# print(data.shape)
# print(data.columns)
# print(data.head(3))

In [4]:
# Peek at the raw column and show the Python type of its first cell
# (the lists are stored as strings in the CSV and need parsing).
print(
    data['ingredients_for_tfidf'].head(),
    type(data['ingredients_for_tfidf'].iloc[0]),
    sep="\n",
)

0    ['cans cherry pie filling', 'eggs', 'sweetened...
1    ['corned beef', 'thousand island dressing', 's...
2    ['butter at', 'sugar', 'vegetable oil', 'eggs'...
3    ['pkge.orange cake mix', 'pkge. instant vanill...
4    ['butter', 'sugar', 'milk', 'vanilla extract',...
Name: ingredients_for_tfidf, dtype: object
<class 'str'>


In [5]:
def safe_parse_ingredients(ingredient_string):
    """Parse a string-encoded ingredient list into a list of strings.

    The value is first evaluated as a Python literal. If that fails, every
    double-quoted span is rewritten to a single-quoted one and the result is
    evaluated again (this rescues some rows with awkward inner quotes).

    Parameters
    ----------
    ingredient_string : Any
        Cell value from the ingredients column, e.g. "['eggs', 'sugar']".

    Returns
    -------
    list of str
        The parsed ingredient names. Always a list: non-string input,
        unparseable text and literals that are not a list/tuple/str all give
        ``[]``; a bare string literal gives a one-element list; non-string
        items inside a parsed list are dropped so callers can safely call
        string methods on every element.
    """
    if not isinstance(ingredient_string, str):
        return []

    # Everything ast.literal_eval is documented to raise on malformed input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    try:
        parsed = ast.literal_eval(ingredient_string)
    except parse_errors:
        # Second chance: normalise "..." spans to '...' and retry.
        cleaned = re.sub(r'\"(.*?)\"', r"'\1'", ingredient_string)
        try:
            parsed = ast.literal_eval(cleaned)
        except parse_errors:
            return []

    if isinstance(parsed, str):
        # A single quoted ingredient rather than a list of them.
        return [parsed]
    if isinstance(parsed, (list, tuple)):
        return [item for item in parsed if isinstance(item, str)]
    # Numbers, dicts, None, ... are not ingredient lists.
    return []

In [6]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused by every normalize() call.
lemmatizer = WordNetLemmatizer()

def normalize(ingredient):
    """Return the ingredient as space-joined, lowercased, lemmatized words.

    The text is split on whitespace; any token that is not purely alphabetic
    (digits, punctuation, hyphenated words, ...) is discarded.
    """
    lemmas = []
    for token in ingredient.split():
        if not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return " ".join(lemmas)

In [7]:
# One text document per recipe: every parsed ingredient is normalized and the
# results are joined with single spaces (this is the TF-IDF input).
data['ingredients_str'] = [
    ' '.join(normalize(name) for name in safe_parse_ingredients(raw))
    for raw in data['ingredients_for_tfidf']
]

In [8]:
# Plain-text preview of the first three rows.
print(data.iloc[:3])

       id                             name  \
0   71247          Cherry Streusel Cobbler   
1   76133  Reuben and Swiss Casserole Bake   
2  503816                 Yam-Pecan Recipe   

                                         description  \
0  I haven't made this in years, so I'm just gues...   
1  I think this is even better than a reuben sand...   
2  A lady I work with heard me taking about ZWT a...   

                                         ingredients  \
0  ["cherry pie filling", "condensed milk", "melt...   
1  ["corned beef chopped", "sauerkraut cold water...   
2  ["unsalted butter", "vegetable oil", "all - pu...   

                                     ingredients_raw  \
0  ["2 (21   ounce) cans   cherry pie filling","2...   
1  ["1/2-1   lb    corned beef, cooked and choppe...   
2  ["3/4  cup    unsalted butter, at room tempera...   

                                               steps  servings serving_size  \
0  ["Preheat oven to 375°F.", "Spread cherry pie ...       6.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",   # optional, removes common words
    token_pattern=r"(?u)\b[a-zA-Z]+\b"  # only keep alphabetic tokens
)

# Fit and transform: one sparse row per recipe, one column per vocabulary term
tfidf_matrix = vectorizer.fit_transform(data['ingredients_str'])

print("TF-IDF shape:", tfidf_matrix.shape)
print("Example vector (first row):")
# The header above was printed without the vector itself; densify only the
# first row so the sparse matrix is never materialised in full.
print(tfidf_matrix[0].toarray())

TF-IDF shape: (500471, 19351)
Example vector (first row):
[[0. 0. 0. ... 0. 0. 0.]]


In [None]:
#Based only on TF-IDF

def get_recommendations_tfidf_only(user_ingredients, top_n=5):
    """
    Recommend recipes using TF-IDF similarity only.

    Parameters
    ----------
    user_ingredients : iterable of str
        Ingredient names the user has; each is passed through ``normalize``.
    top_n : int, default 5
        Number of recipes to return. Values <= 0 return an empty list.

    Returns
    -------
    list of dict
        Best match first. Each dict has the keys ``index`` (row position in
        ``data``), ``name``, ``ingredients``, ``steps``, ``tfidf_score``,
        ``score`` (equal to ``tfidf_score``), ``matching_ingredients`` and
        ``missing_ingredients`` (always empty here), ``coverage_score``
        (always ``None`` here) and ``total_ingredients``.
    """
    # `[-top_n:]` with top_n == 0 would select every row, so bail out early.
    if top_n <= 0:
        return []

    # Normalize user ingredients for TF-IDF text
    user_norm = [normalize(ing) for ing in user_ingredients]
    user_text = ' '.join(user_norm)
    user_vector = vectorizer.transform([user_text])

    tfidf_scores = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Descending order, then keep the first top_n positions.
    top_indices = tfidf_scores.argsort()[::-1][:top_n]

    # Prefer 'cleaned_ingredients' when the frame has it; otherwise use the
    # column the rest of this notebook parses.
    if 'cleaned_ingredients' in data.columns:
        ingredient_col = 'cleaned_ingredients'
    else:
        ingredient_col = 'ingredients_for_tfidf'

    recommendations = []
    for idx in top_indices:
        recipe = data.iloc[idx]
        # Parse once and reuse for both the listing and the distinct count.
        ingredients = safe_parse_ingredients(recipe[ingredient_col])
        ingredients_set = {normalize(ing) for ing in ingredients}
        recommendations.append({
            'index': idx,
            'name': recipe['name'],
            'ingredients': ingredients,
            'steps': recipe['steps'] if isinstance(recipe['steps'], str) else str(recipe['steps']),
            'tfidf_score': tfidf_scores[idx],
            'score': tfidf_scores[idx],  # same as TF-IDF
            'matching_ingredients': [],
            'missing_ingredients': [],
            'coverage_score': None,
            'total_ingredients': len(ingredients_set)
        })
    return recommendations

def display_recommendations(results, user_ingredients):
    """Print each recommendation as plain text.

    For every recipe dict in ``results`` this prints its name, its TF-IDF
    score as a percentage, the ingredient list (✅ when the normalized name is
    among the normalized ``user_ingredients``, 🛒 otherwise), a have/total
    count, the numbered steps and a separator line.
    """
    # Normalized pantry used for membership checks.
    pantry = {normalize(item) for item in user_ingredients}

    for rank, rec in enumerate(results, start=1):
        print(f"🍴 Recommendation {rank}: {rec['name']}")
        print(f"   📊 Similarity Score: {rec['tfidf_score']*100:.1f}%")

        names = rec['ingredients']
        owned = [normalize(name) in pantry for name in names]
        labelled = [
            f"✅ {name}" if has_it else f"🛒 {name}"
            for name, has_it in zip(names, owned)
        ]

        print("   📝 Ingredients:")
        print("      " + ", ".join(labelled))
        print(f"   ✅ You have {sum(owned)}/{len(names)} ingredients")

        # Numbered cooking instructions.
        instructions = parse_steps(rec['steps'])
        print("   📝 Recipe Steps:")
        for number, instruction in enumerate(instructions, start=1):
            print(f"      {number}. {instruction}")

        print("-" * 60)

In [10]:
def parse_steps(steps_str):
    """Convert recipe steps into a clean list of instruction strings.

    Parameters
    ----------
    steps_str : Any
        Usually a string holding a Python-literal list of steps. A list/tuple
        is also accepted as-is, and a missing value (``None`` or NaN) is
        tolerated.

    Returns
    -------
    list of str
        * missing value or blank string -> ``[]``
        * text that parses to a list/tuple -> its non-empty items, each
          converted to ``str`` and stripped of surrounding whitespace/quotes
        * text that is not a valid literal -> ``[stripped text]``
        * any other parsed value -> ``[str(value).strip()]``
    """
    # Missing values: None, or a float NaN (NaN is the only float != itself).
    if steps_str is None or (isinstance(steps_str, float) and steps_str != steps_str):
        return []

    if isinstance(steps_str, str):
        text = steps_str.strip()
        if not text:
            return []
        try:
            steps = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat the whole text as a single step.
            return [text]
    else:
        # Already-parsed input (e.g. a list) needs no evaluation.
        steps = steps_str

    if isinstance(steps, (list, tuple)):
        # str() keeps a stray non-string item from discarding the whole list.
        return [str(s).strip().strip('"').strip("'") for s in steps if s]
    return [str(steps).strip()]


In [13]:

def get_recommendations_tfidf_coverage(user_ingredients, top_n=5,
                                       tfidf_weight=0.4, coverage_weight=0.6):
    """
    Recommend recipes using both TF-IDF similarity and ingredient coverage.

    The final score of a recipe is
    ``tfidf_weight * cosine_similarity + coverage_weight * coverage``, where
    coverage is the fraction of the recipe's normalized ingredient set that is
    also in the user's normalized ingredient set (0 for an empty recipe set).

    Requires ``data['ingredients_norm_set']`` (a set of normalized names per
    recipe) to exist, with rows in the same order as ``tfidf_matrix``.

    Parameters
    ----------
    user_ingredients : iterable of str
        Ingredient names the user has; each is passed through ``normalize``.
    top_n : int, default 5
        Number of recipes to return (applied as a ``[:top_n]`` slice).
    tfidf_weight : float, default 0.4
        Weight of the TF-IDF cosine similarity in the final score.
    coverage_weight : float, default 0.6
        Weight of the coverage fraction in the final score.

    Returns
    -------
    list of dict
        Highest score first; ties keep the row order of ``data``. Keys:
        ``index`` (index label in ``data``), ``name``, ``steps``,
        ``ingredients``, ``matching_ingredients``, ``missing_ingredients``,
        ``total_ingredients``, ``coverage_score``, ``tfidf_score``, ``score``.

    Raises
    ------
    ValueError
        If ``data`` and ``tfidf_matrix`` do not have the same number of rows.
    """
    # Normalize user ingredients
    user_norm = [normalize(ing) for ing in user_ingredients]
    # An input that normalizes to "" carries no information and must not
    # match an equally empty entry in a recipe set.
    user_set = {name for name in user_norm if name}

    # TF-IDF similarity
    user_text = ' '.join(user_norm)
    user_vector = vectorizer.transform([user_text])
    tfidf_scores = cosine_similarity(user_vector, tfidf_matrix).flatten()

    recipe_sets = data['ingredients_norm_set'].tolist()
    if len(recipe_sets) != len(tfidf_scores):
        raise ValueError(
            f"data has {len(recipe_sets)} rows but tfidf_matrix has "
            f"{len(tfidf_scores)}; re-fit the vectorizer on the current data"
        )

    # Score every recipe by row position, so a non-default DataFrame index
    # cannot misalign rows with their TF-IDF scores.
    coverage_scores = np.fromiter(
        (len(user_set & rs) / len(rs) if rs else 0.0 for rs in recipe_sets),
        dtype=float,
        count=len(recipe_sets),
    )
    final_scores = tfidf_weight * tfidf_scores + coverage_weight * coverage_scores

    # Stable sort on the negated score = descending order with ties kept in
    # row order, the same order a stable reverse sort would give.
    ranked = np.argsort(-final_scores, kind='stable')[:top_n]

    # Build the (comparatively expensive) result dicts only for the winners.
    recommendations = []
    for pos in ranked:
        recipe = data.iloc[pos]
        recipe_set = recipe_sets[pos]
        recommendations.append({
            'index': data.index[pos],
            'name': recipe['name'],
            'steps': recipe['steps'] if isinstance(recipe['steps'], str) else str(recipe['steps']),
            'ingredients': safe_parse_ingredients(recipe['ingredients_for_tfidf']),
            'matching_ingredients': list(user_set & recipe_set),
            'missing_ingredients': list(recipe_set - user_set),
            'total_ingredients': len(recipe_set),
            'coverage_score': float(coverage_scores[pos]),
            'tfidf_score': tfidf_scores[pos],
            'score': final_scores[pos]
        })

    return recommendations


In [14]:
# Precompute, per recipe, the set of normalized ingredient names read by the
# coverage-based recommender.
data['ingredients_norm_set'] = data['ingredients_for_tfidf'].map(
    lambda raw: {normalize(name) for name in safe_parse_ingredients(raw)}
)

In [16]:
from IPython.display import display, HTML

def display_recommendations_html(recommendations, top_n=5, user_ingredients=None):
    """Render recommendations as HTML cards in the notebook output.

    Parameters
    ----------
    recommendations : list of dict
        Recipe dicts with the keys ``name``, ``ingredients``, ``steps``,
        ``tfidf_score`` and ``coverage_score`` (which may be ``None``).
    top_n : int, default 5
        Only the first ``top_n`` recommendations are rendered.
    user_ingredients : iterable of str, optional
        Ingredients the user has, used to mark each recipe ingredient as
        owned (✅) or missing (🛒). When omitted, the notebook-level variable
        ``user_ingredients`` is used if it exists (kept for existing calls);
        otherwise nothing is marked as owned.

    Returns
    -------
    None
        The cards are shown via ``IPython.display.display``.
    """
    if user_ingredients is None:
        # Backward compatibility with calls that rely on the global variable.
        user_ingredients = globals().get('user_ingredients') or []
    # Drop names that normalize to "" so they cannot match anything.
    user_set = {
        name for name in (normalize(ing) for ing in user_ingredients) if name
    }

    rows = []
    for i, recipe in enumerate(recommendations[:top_n], 1):
        coverage = recipe.get('coverage_score')
        if coverage is None:
            # TF-IDF-only results carry no coverage: skip the bar entirely.
            bar_html = ""
            coverage_text = "n/a"
        else:
            # Clamp so a bad score cannot produce an invalid CSS width.
            coverage_pct = min(max(coverage * 100, 0.0), 100.0)
            bar_html = f"""
            <div style="background:#eee;width:200px;">
                <div style="background:#4caf50;width:{coverage_pct:.1f}%;color:white;text-align:center;">
                    {coverage_pct:.1f}%
                </div>
            </div>
        """
            coverage_text = f"{coverage:.3f}"

        # Recipe text comes from the CSV, so escape it before embedding.
        ingredients_html = []
        for ing in recipe['ingredients']:
            norm_ing = normalize(ing)
            safe_ing = html.escape(str(ing))
            if norm_ing in user_set:
                ingredients_html.append(f"<span style='color:green;'>✅ {safe_ing}</span>")
            else:
                ingredients_html.append(f"<span style='color:gray;'>🛒 {safe_ing}</span>")

        steps = parse_steps(recipe['steps'])
        steps_html = "<br>".join(
            [f"{j+1}. {html.escape(str(s))}" for j, s in enumerate(steps)]
        )

        row = f"""
        <div style="border:1px solid #ddd;padding:10px;margin:10px;border-radius:8px;">
            <h3>{i}. 📗 {html.escape(str(recipe['name']))}</h3>
            {bar_html}<br>
            <b>Ingredients:</b> {" | ".join(ingredients_html)} <br><br>
            <b>Similarity:</b> {recipe['tfidf_score']:.3f} | 
            <b>Coverage:</b> {coverage_text} <br><br>
            <b>Steps:</b><br>{steps_html}
        </div>
        """
        rows.append(row)

    display(HTML("".join(rows)))


In [26]:
# Example query: rank with the combined score, keep five, render the top three.
user_ingredients = [
    "eggs", "oil", "onion",
    "tomatoes", "paprika", "garlic",
    "pepper", "salt", "parsley",
]
results = get_recommendations_tfidf_coverage(user_ingredients, top_n=5)
display_recommendations_html(results, top_n=3)

In [17]:
# Example query: rank with the combined score, keep five, render the top three.
user_ingredients = [
    "apples",
    "all-purpose flour",
    "butter",
    "sugar",
    "brown sugar",
    "cinnamon",
    "nutmeg",
    "lemon juice",
    "egg",
    "salt",
]
results = get_recommendations_tfidf_coverage(user_ingredients, top_n=5)
display_recommendations_html(results, top_n=3)


In [24]:
# Example query: rank with the combined score, keep five, render the top three.
user_ingredients = [
    "spaghetti", "olive oil", "garlic", "onion",
    "crushed tomatoes", "tomato paste",
    "basil", "oregano", "parmesan cheese",
    "salt", "black pepper",
]
results = get_recommendations_tfidf_coverage(user_ingredients, top_n=5)
display_recommendations_html(results, top_n=3)

In [27]:
# Example query: rank with the combined score, keep five, render the top three.
user_ingredients = [
    "olive oil", "chicken breasts", "salt", "pepper",
    "mushrooms", "shallot", "dry wine", "chicken stock",
    "cream", "grainy mustard", "tarragon", "egg noodles",
]
results = get_recommendations_tfidf_coverage(user_ingredients, top_n=5)
display_recommendations_html(results, top_n=3)
