# Import Packages

In [None]:
!pip install gensim
!pip install spacy
!pip install streamlit

In [None]:
# Data handling
import json
import pandas as pd
import numpy as np
import pickle

# NLP
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import re

# Model building
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Interface
import streamlit as st

# Import data and format it

In [None]:
# Load the JSON files
# JSON text is UTF-8; passing the encoding explicitly keeps non-ASCII
# characters (e.g. "®" in brand names) from depending on the platform's
# default text encoding.
with open('recipes_raw_nosource_ar.json', 'r', encoding='utf-8') as json_file:
    data1 = json.load(json_file)
with open('recipes_raw_nosource_epi.json', 'r', encoding='utf-8') as json_file:
    data2 = json.load(json_file)
with open('recipes_raw_nosource_fn.json', 'r', encoding='utf-8') as json_file:
    data3 = json.load(json_file)

# Remove "ADVERTISEMENT" from the JSON data
def remove_advertisement(obj):
    """Recursively strip the "ADVERTISEMENT" marker from a decoded JSON value.

    Args:
        obj: A string, list, dict, or any other decoded JSON value.

    Returns:
        * str  -> the string with every marker removed (together with the
          single space in front of it, when there is one).
        * list -> a tuple of the cleaned items. A tuple is used so the values
          are hashable. String items that consisted of nothing but the marker
          are dropped instead of being kept as junk entries.
        * dict -> a new dict with the same keys and cleaned values.
        * anything else -> returned unchanged.
    """
    if isinstance(obj, str):
        # Remove the space-prefixed form first so no trailing blank is left,
        # then any remaining bare marker (e.g. a stand-alone "ADVERTISEMENT").
        return obj.replace(" ADVERTISEMENT", "").replace("ADVERTISEMENT", "")
    if isinstance(obj, list):
        cleaned_items = []
        for item in obj:
            cleaned = remove_advertisement(item)
            # An entry that was only the marker carries no information.
            if isinstance(item, str) and "ADVERTISEMENT" in item and not cleaned.strip():
                continue
            cleaned_items.append(cleaned)
        return tuple(cleaned_items)  # Convert list to tuple
    if isinstance(obj, dict):
        return {key: remove_advertisement(value) for key, value in obj.items()}
    return obj

# Strip the advertisement marker from each of the three raw sources
modified_data1, modified_data2, modified_data3 = (
    remove_advertisement(raw_source) for raw_source in (data1, data2, data3)
)

# Build one frame per source; the transpose turns the outer JSON keys into rows
df1 = pd.DataFrame(modified_data1).transpose()
df2 = pd.DataFrame(modified_data2).transpose()
df3 = pd.DataFrame(modified_data3).transpose()

# Stack the sources and replace the original row labels with a 0..n-1 index
df = pd.concat([df1, df2, df3]).reset_index(drop=True)

# Remove duplicates and null
df = df.drop_duplicates().dropna()

df.head()

Unnamed: 0,title,ingredients,instructions,picture_link
0,Slow Cooker Chicken and Dumplings,"(4 skinless, boneless chicken breast halves, 2...","Place the chicken, butter, soup, and onion in ...",55lznCYBbs2mT8BTx6BTkLhynGHzM.S
1,Awesome Slow Cooker Pot Roast,(2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe
2,Brown Sugar Meatloaf,"(1/2 cup packed brown sugar, 1/2 cup ketchup, ...",Preheat oven to 350 degrees F (175 degrees C)....,LVW1DI0vtlCrpAhNSEQysE9i/7rJG56
3,Best Chocolate Chip Cookies,"(1 cup butter, softened, 1 cup white sugar, 1 ...",Preheat oven to 350 degrees F (175 degrees C)....,0SO5kdWOV94j6EfAVwMMYRM3yNN8eRi
4,Homemade Mac and Cheese Casserole,"(8 ounces whole wheat rotini pasta, 3 cups fre...",Preheat oven to 350 degrees F. Line a 2-quart ...,YCnbhplMgiraW4rUXcybgSEZinSgljm


In [None]:
# Sanity check after cleaning: nulls per column, then the count of duplicated rows
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print(duplicate_row_count)

title           0
ingredients     0
instructions    0
picture_link    0
dtype: int64
0


In [None]:
# Remove advertisements
def remove_advertisement(ingredients):
    """Remove the 'ADVERTISEMENT' marker from a sequence of ingredient strings.

    Note: this reuses the name of the recursive JSON cleaner defined earlier
    in the notebook and replaces it from this cell onward.

    Args:
        ingredients: Iterable of ingredient strings (one recipe's ingredients).

    Returns:
        A tuple of ingredient strings. Strings without the marker are kept
        unchanged. Strings containing it have the marker removed and the
        surrounding whitespace trimmed; entries that were only the marker are
        dropped rather than kept as whitespace-only ingredients.
    """
    cleaned_ingredients = []
    for ingredient in ingredients:
        if 'ADVERTISEMENT' not in ingredient:
            cleaned_ingredients.append(ingredient)
            continue
        # Substitute a space so words on either side of the marker do not
        # fuse, then trim what is left at the ends.
        remainder = ingredient.replace('ADVERTISEMENT', ' ').strip()
        if remainder:
            cleaned_ingredients.append(remainder)
    return tuple(cleaned_ingredients)

# Run the ingredient-level cleaner over every recipe's 'ingredients' value
cleaned_ingredient_column = df['ingredients'].apply(remove_advertisement)
df['ingredients'] = cleaned_ingredient_column

# Extract relevant columns, combine them, and lemmatize them

### Create Cooking Time Column

In [None]:
# Extract cooking time from the instructions
def extract_cooking_time(text):
    """Collect every duration mentioned in a recipe's instructions.

    A duration is a number (optionally decimal), optionally followed by a
    range written with a hyphen, an en dash or the word "to", followed by a
    time unit (minute(s), min(s), hour(s), hr(s), day(s)). Matching is
    case-insensitive.

    Args:
        text: Instruction text. Non-string values (e.g. NaN) yield "".

    Returns:
        The matched durations in order of appearance, joined by ", ";
        an empty string when none is found.
    """
    if not isinstance(text, str):
        return ""
    # The optional middle group keeps "3 to 4 minutes" together; without it
    # only "4 minutes" would be captured.
    matches = re.findall(
        r'\b\d+(?:\.\d+)?(?:\s*(?:-|–|to)\s*\d+(?:\.\d+)?)?\s*(?:minutes?|mins?|hours?|hrs?|days?)\b',
        text,
        flags=re.IGNORECASE,
    )
    return ", ".join(matches)

# Derive the cooking_time column from each recipe's instruction text
extracted_times = df['instructions'].apply(extract_cooking_time)
df['cooking_time'] = extracted_times

### Save df to import into app.py

In [None]:
# Pickle the recipe DataFrame (now including cooking_time) to df.pkl
with open('df.pkl', 'wb') as df_file:
    pickle.dump(df, df_file)

### Preprocess text using lemmatization and spacy **(don't need to run again)**

In [None]:
# spaCy English pipeline shared by the lemmatization helper below
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Lemmatize ``text`` with spaCy, discarding stop words and punctuation.

    Returns the lemmas of the remaining tokens joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

In [None]:
# Apply preprocessing to instructions and ingredients.
# The lemmatized text goes into separate Series instead of overwriting the df
# columns: re-running this cell stays idempotent, and df keeps the original
# text that the recommendation cells print.
lemmatized_instructions = df['instructions'].apply(preprocess_text)
lemmatized_ingredients = df['ingredients'].apply(
    lambda items: " ".join(preprocess_text(ingredient) for ingredient in items)
)

# Combine our text data
combined_text = lemmatized_instructions + " " + lemmatized_ingredients + " " + df['cooking_time']

### Save combined text to avoid running code again and to import into app.py **(don't need to run again)**

In [None]:
# Write the combined, preprocessed text to disk so it can be reloaded later
with open('combined_text.pkl', 'wb') as corpus_file:
    pickle.dump(combined_text, corpus_file)

### Load combined text for future runs

In [None]:
# Read the previously saved combined text back from combined_text.pkl
with open('combined_text.pkl', 'rb') as corpus_file:
    combined_text = pickle.load(corpus_file)

# Model Building
## Parameter Optimization **(don't need to run again)**
This section only needs to be run once to obtain the ideal parameters.

In [None]:
# Define TF-IDF parameters grid
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'svd__n_components': [50, 100, 200]  # Truncated SVD parameters
}

# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD())
])


def retained_variance_score(estimator, X, y=None):
    """Unsupervised scorer: share of held-out TF-IDF energy kept by the SVD step.

    The pipeline has no target, so a supervised metric such as 'accuracy'
    cannot be evaluated (every fold scores NaN). This scorer instead compares
    the squared Frobenius norm of the held-out documents after the SVD
    projection with their squared norm in TF-IDF space. With orthonormal SVD
    components this equals one minus the relative reconstruction error.

    NOTE(review): this measures reconstruction fidelity, not recommendation
    quality, and will tend to favour more components — confirm it is the
    criterion you want before trusting the selected parameters.

    Args:
        estimator: The fitted tfidf + svd pipeline for one parameter setting.
        X: Held-out documents of the current fold.
        y: Unused; accepted so the function matches the scorer signature.

    Returns:
        A float (higher is better), 0.0 when the fold has no TF-IDF mass.
    """
    held_out_tfidf = estimator.named_steps['tfidf'].transform(X)
    held_out_reduced = estimator.named_steps['svd'].transform(held_out_tfidf)
    # Element-wise square on the sparse matrix avoids densifying it.
    total_energy = held_out_tfidf.multiply(held_out_tfidf).sum()
    if total_energy == 0:
        return 0.0
    return float(np.square(held_out_reduced).sum() / total_energy)


# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=retained_variance_score)
grid_search.fit(combined_text)

# Get best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Apply best parameters to the pipeline
pipeline.set_params(**best_params)

# Fit the pipeline
pipeline.fit(combined_text)

# Extract the TF-IDF matrix and Truncated SVD matrix
tfidf_matrix = pipeline.named_steps['tfidf'].transform(combined_text)
svd_matrix = pipeline.named_steps['svd'].transform(tfidf_matrix)

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 980, in _score
    scores = scorer(estimator, X_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

 nan nan nan nan nan nan nan nan nan]


Best Parameters: {'svd__n_components': 50, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}


## Fitting model on data
Use the optimal parameters found by GridSearchCV in the code above.

In [None]:
# Feature Extraction using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(combined_text)

# Dimensionality Reduction using TruncatedSVD
# The default solver is randomized; a fixed random_state makes the reduced
# space, and therefore the recommendations, reproducible across runs.
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

In [None]:
# Cosine Similarity Calculation
def get_recipe_recommendations(query, tfidf_vectorizer, svd_model, data, recipe_matrix=None):
    """Return the recipe whose reduced TF-IDF vector is most similar to the query.

    Args:
        query: Free-text user request.
        tfidf_vectorizer: Fitted vectorizer used to encode the query.
        svd_model: Fitted dimensionality reducer applied after the vectorizer.
        data: DataFrame of recipes; rows must be in the same order as the
            rows of ``recipe_matrix``, because the lookup is positional.
        recipe_matrix: Reduced recipe vectors to compare against. Defaults to
            the notebook-level ``svd_matrix`` so existing calls keep working;
            pass it explicitly to avoid relying on that global.

    Returns:
        The row of ``data`` at the position of the highest cosine similarity.
        NOTE(review): when the query shares no term with the vectorizer's
        vocabulary all similarities are presumably equal and position 0 is
        returned — confirm whether callers should special-case that.
    """
    if recipe_matrix is None:
        recipe_matrix = svd_matrix

    # Preprocess the query the same way the corpus was preprocessed
    query = preprocess_text(query)
    query_vector = tfidf_vectorizer.transform([query])
    query_svd = svd_model.transform(query_vector)

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(query_svd, recipe_matrix)

    # Get index of the most similar recipe
    top_recipe_index = np.argmax(similarity_scores)

    # Return the most similar recipe
    return data.iloc[top_recipe_index]

## Final Model Output

In [None]:
# Try the TF-IDF + SVD recommender on a sample question
test_question = "Quick dinner recipe with chicken, broccoli, and rice under 30 minutes."
recommended_recipe = get_recipe_recommendations(test_question, tfidf, svd, df)

print("\033[1mQuestion:\033[0m", test_question)
print("\n\033[1mTitle:\033[0m", recommended_recipe['title'])
# The remaining fields are printed on the line below their bold label
for label, field in (
    ("\033[1mIngredients:\033[0m\n", 'ingredients'),
    ("\033[1mInstructions:\033[0m\n", 'instructions'),
    ("\033[1mTime Duration:\033[0m\n", 'cooking_time'),
):
    print(label, recommended_recipe[field])

[1mQuestion:[0m Quick dinner recipe with chicken, broccoli, and rice under 30 minutes.

[1mTitle:[0m Garlic Chicken, Vegetable and Rice Skillet
[1mIngredients:[0m
 ('Vegetable cooking spray', '1 1/4 pounds skinless, boneless chicken breast halves', '2 cloves garlic, minced', '1 3/4 cups Swanson® Chicken Broth or Swanson® Chicken Stock', '3/4 cup uncooked white rice', '1 (16 ounce) package frozen vegetable combination (broccoli, cauliflower, carrots)', '1/3 cup grated Parmesan cheese', 'Paprika', 'ADVERTISEMENT')
[1mInstructions:[0m
 Spray a 12-inch skillet with the cooking spray and heat over medium-high heat for 1 minute. Add the chicken and garlic and cook for 10 minutes or until the chicken is well browned on both sides. Remove the chicken from the skillet.
Stir the broth, rice and vegetables in the skillet and heat to a boil. Reduce the heat to low. Cover and cook for 15 minutes. Stir in the cheese.
Return the chicken to the skillet. Sprinkle the chicken with the paprika. C

# Using Word2Vec

In [None]:
# Word2Vec model (gensim) plus NLTK tokenizer and stop-word list used below
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NOTE(review): word_tokenize and stopwords.words presumably require the NLTK
# data packages to have been fetched with nltk.download beforehand — confirm
# for fresh environments.
import warnings
# Installs an "ignore" filter for all warnings from this point on, not only
# for the code in this cell.
warnings.simplefilter("ignore")

# Preprocessing function
# Stop-word set, loaded once on the first call instead of once per recipe.
_W2V_STOP_WORDS = None

def preprocess_text_w2v(text):
    """Tokenize ``text`` for Word2Vec.

    Splits the text with NLTK's ``word_tokenize``, keeps only alphanumeric
    tokens (which removes punctuation), lower-cases them and removes English
    stop words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-case tokens.
    """
    global _W2V_STOP_WORDS
    if _W2V_STOP_WORDS is None:
        _W2V_STOP_WORDS = frozenset(stopwords.words('english'))

    # Tokenize, drop punctuation and normalize case
    lowered = (word.lower() for word in word_tokenize(text) if word.isalnum())
    # Remove stop words
    return [word for word in lowered if word not in _W2V_STOP_WORDS]

# Tokenize each recipe's combined text and keep the tokens on the DataFrame
tokenized_recipes = combined_text.apply(preprocess_text_w2v)
df['preprocessed_text'] = tokenized_recipes

# Train Word2Vec on the tokenized recipes
word2vec_model = Word2Vec(
    sentences=df['preprocessed_text'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Average word vectors into a single embedding
def generate_recipe_embedding(tokens, model):
    """Return the mean vector of the tokens that ``model.wv`` contains.

    Tokens missing from ``model.wv`` are skipped. When no token is known, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for word in tokens:
        if word in model.wv:
            known_vectors.append(model.wv[word])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every recipe's token list with the trained Word2Vec model
def _embed_with_trained_model(tokens):
    """Embed one recipe's tokens using ``word2vec_model``."""
    return generate_recipe_embedding(tokens, word2vec_model)

df['recipe_embedding'] = df['preprocessed_text'].apply(_embed_with_trained_model)

# Function to recommend recipe based on user input
def recommend_recipe(user_input, df, model):
    """Return the recipe whose Word2Vec embedding is closest to the user's text.

    Args:
        user_input: Free-text request from the user.
        df: Recipe DataFrame with a 'recipe_embedding' column of equal-length
            vectors.
        model: Trained Word2Vec model used to embed the request.

    Returns:
        The row of ``df`` with the highest cosine similarity to the request,
        or the string "Unable to process input. Please provide valid text."
        when none of the request's tokens is known to the model.
    """
    # The corpus was lemmatized with spaCy before Word2Vec was trained, so the
    # request is lemmatized the same way; otherwise inflected words such as
    # plurals would miss the model's vocabulary and be ignored.
    user_tokens = preprocess_text_w2v(preprocess_text(user_input))
    user_embedding = generate_recipe_embedding(user_tokens, model)
    if not np.any(user_embedding):
        return "Unable to process input. Please provide valid text."

    # Score all recipes in one vectorized pass instead of one Python call per row.
    recipe_matrix = np.vstack(df['recipe_embedding'].tolist())
    norm_products = np.linalg.norm(recipe_matrix, axis=1) * np.linalg.norm(user_embedding)

    # Recipes with an all-zero embedding have no defined cosine similarity;
    # rank them last rather than dividing by zero.
    similarities = np.full(len(recipe_matrix), -np.inf)
    comparable = norm_products > 0
    similarities[comparable] = (recipe_matrix[comparable] @ user_embedding) / norm_products[comparable]

    return df.iloc[int(np.argmax(similarities))]

## Final Model Output

In [None]:
# Try the Word2Vec recommender on the same sample question
test_question = "Quick dinner recipe with chicken, broccoli, and rice under 30 minutes."
recommended_recipe = recommend_recipe(test_question, df, word2vec_model)

print("\033[1mQuestion:\033[0m", test_question)
print("\n\033[1mTitle:\033[0m", recommended_recipe['title'])
# The remaining fields are printed on the line below their bold label
for label, field in (
    ("\033[1mIngredients:\033[0m\n", 'ingredients'),
    ("\033[1mInstructions:\033[0m\n", 'instructions'),
    ("\033[1mTime Duration:\033[0m\n", 'cooking_time'),
):
    print(label, recommended_recipe[field])

[1mQuestion:[0m Quick dinner recipe with chicken, broccoli, and rice under 30 minutes.

[1mTitle:[0m Country Chicken and Rice Soup
[1mIngredients:[0m
 ('2 containers Minute® Ready to Serve White Rice', '1 cup (6oz.) cooked, diced chicken breast', '2 cups low-sodium chicken broth', '1 cup frozen mixed vegetables')
[1mInstructions:[0m
 Heat rice according to package directions. In medium microwave-safe bowl, combine chicken, broth and vegetables. Microwave on high for 5 minutes. Stir in rice.
Cooks' notes:
You can adjust the amount of ingredients based on your tastes. Save leftovers for lunch the next day!
[1mTime Duration:[0m
 5 minutes


# Play around with TFIDF SVD Model vs Word2Vec Model Outputs

In [None]:
test_question = input("What do you want to make?")


def _print_recommendation(heading, recipe):
    """Print one model's answer to ``test_question`` under ``heading``.

    ``recipe`` is normally a recipe row. recommend_recipe returns a plain
    message string when it cannot embed the input; that message is printed
    as-is instead of being indexed like a recipe.
    """
    print(heading)
    print("\033[1mQuestion:\033[0m", test_question)
    if isinstance(recipe, str):
        print(recipe)
        return
    print("\n\033[1mTitle:\033[0m", recipe['title'])
    print("\033[1mIngredients:\033[0m\n", recipe['ingredients'])
    print("\033[1mInstructions:\033[0m", recipe['instructions'])
    print("\033[1mTime Duration:\033[0m\n", recipe['cooking_time'])


recommended_recipe_tfidf_svd = get_recipe_recommendations(test_question, tfidf, svd, df)
_print_recommendation("TFIDF SVD Output:", recommended_recipe_tfidf_svd)

recommended_recipe_w2v = recommend_recipe(test_question, df, word2vec_model)
_print_recommendation("\nW2V Output:", recommended_recipe_w2v)

TFIDF SVD Output:
[1mQuestion:[0m bacon, eggs, ham, cheese

[1mTitle:[0m Hash Brown Potato Pie
[1mIngredients:[0m
 ('3 cups Ore-Ida® Shredded Hash Brown Potatoes', '5 eggs', '1/2 cup milk', '1/3 cup green onions, sliced', '1/2 teaspoon salt', '1/4 teaspoon hot sauce', '1 1/2 cups shredded sharp Cheddar cheese, divided', '4 slices bacon, cooked crisp and crumbled, divided', 'ADVERTISEMENT')
[1mInstructions:[0m Heat oven to 350 degrees F.
Beat together eggs and milk in a medium bowl; stir in potatoes, green onions, salt and pepper sauce. Stir in 1 cup of the cheese and half of bacon. Pour into greased 9-inch pie plate or quiche dish.
Bake 25-30 minutes or until center is set. Sprinkle remaining bacon and 1/2 cup cheese over top of pie; continue baking 3 to 4 minutes or until cheese is melted.

[1mTime Duration:[0m
 25-30 minutes, 4 minutes

W2V Output:
[1mQuestion:[0m bacon, eggs, ham, cheese

[1mTitle:[0m BBQ Turkey Sandwiches
[1mIngredients:[0m
 ('4 slices rustic Italia

In [None]:
# Page title followed by the two introductory paragraphs
intro_paragraphs = (
    "Welcome to the Recipe Recommender System! This system is designed to help you find the perfect recipe for your next meal. Whether you're looking for a quick dinner recipe or a healthy breakfast idea, we've got you covered!",
    "To get started, simply type in the ingredients you have and the amount of time you have to cook. We'll take care of the rest!",
)
st.title("Recipe Recommender System")
for paragraph in intro_paragraphs:
    st.write(paragraph)


# Script to Run Streamlit Front-End Interface

In [None]:
import streamlit as st
import pickle
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Streamlit re-executes this script from the top on every user interaction.
# Caching the loading and fitting keeps the pickles from being re-read and the
# model from being refit each time a widget changes.
@st.cache_resource
def _load_recommender_artifacts():
    """Load the pickled data and fit the TF-IDF + SVD model once per server process.

    Returns:
        (df, combined_text, tfidf, tfidf_matrix, svd, svd_matrix), i.e. the
        recipe DataFrame, the preprocessed text, the fitted vectorizer with
        its document-term matrix, and the fitted reducer with the reduced
        recipe matrix.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this project.
    # Load the df
    with open('df.pkl', 'rb') as f:
        recipes = pickle.load(f)

    # Load the combined_text (preprocessed text data)
    with open('combined_text.pkl', 'rb') as f:
        corpus = pickle.load(f)

    # Feature Extraction using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    term_matrix = vectorizer.fit_transform(corpus)

    # Dimensionality Reduction using TruncatedSVD
    reducer = TruncatedSVD(n_components=50)
    reduced_matrix = reducer.fit_transform(term_matrix)

    return recipes, corpus, vectorizer, term_matrix, reducer, reduced_matrix


df, combined_text, tfidf, tfidf_matrix, svd, svd_matrix = _load_recommender_artifacts()

# Function to preprocess text
@st.cache_resource
def _load_spacy_pipeline():
    """Load the spaCy English pipeline once instead of on every script rerun."""
    return spacy.load("en_core_web_sm")


nlp = _load_spacy_pipeline()

def preprocess_text(text):
    """Lemmatize ``text`` with spaCy, discarding stop words and punctuation.

    Returns the lemmas of the remaining tokens joined by single spaces.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to make predictions
def get_recipe_recommendations(query, tfidf_vectorizer, svd_model, data, recipe_matrix=None):
    """Return the recipe whose reduced TF-IDF vector is most similar to the query.

    Args:
        query: Free-text user request.
        tfidf_vectorizer: Fitted vectorizer used to encode the query.
        svd_model: Fitted dimensionality reducer applied after the vectorizer.
        data: DataFrame of recipes; rows must be in the same order as the
            rows of ``recipe_matrix``, because the lookup is positional.
        recipe_matrix: Reduced recipe vectors to compare against. Defaults to
            the script-level ``svd_matrix`` so existing calls keep working.

    Returns:
        The row of ``data`` at the position of the highest cosine similarity.
    """
    if recipe_matrix is None:
        recipe_matrix = svd_matrix
    # Preprocess the query
    query = preprocess_text(query)
    query_vector = tfidf_vectorizer.transform([query])
    query_svd = svd_model.transform(query_vector)
    # Calculate cosine similarity
    similarity_scores = cosine_similarity(query_svd, recipe_matrix)
    # Get index of the most similar recipe
    top_recipe_index = np.argmax(similarity_scores)
    # Return the most similar recipe
    return data.iloc[top_recipe_index]

# Streamlit app layout
st.title("Recipe Recommender")

# User input
user_input = st.text_area("What do you want to make?")

# Recommendation
if st.button("Get Recommendation"):
    # Treat whitespace-only text as empty: it contains nothing to match on,
    # so a recommendation for it would be arbitrary.
    if user_input and user_input.strip():
        recommended_recipe = get_recipe_recommendations(user_input, tfidf, svd, df)
        st.write("Recommended Recipe:")
        st.write("Title:", recommended_recipe['title'])
        st.write("Ingredients:", recommended_recipe['ingredients'])
        st.write("Instructions:", recommended_recipe['instructions'])
        st.write("Cooking Time:", recommended_recipe['cooking_time'])
    else:
        st.warning("Please enter a question.")