In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# load df_main_clean.csv
df = pd.read_csv('./data/df_main_clean.csv')

In [4]:
df_ingr = df['ingredients']
# concat recipe_name, category and ingredient columns
df_ingr = df_ingr.str.cat(df[['recipe_name', 'category']], sep=' ')

In [5]:
def text_clean(df):
    # remove each word with 'ed' in the end for example 'fried', 'sliced', 'chopped' remove them
    df = df.apply(lambda x: re.sub(r'\b\w+ed\b', '', x))
    # replace comma with space
    df = df.apply(lambda x: re.sub(r',', ' ', x))

    # remove punctuation
    df = df.apply(lambda x: re.sub(r'[^\w\s]', '', x))

    # remove unwanted characters and symbols
    df = df.apply(lambda x: re.sub(r'[^a-zA-Z0-9]', ' ', x))

    # convert to lowercase
    df = df.apply(lambda x: x.lower())

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    df = df.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

    # Tokenize ingredients into a list of individual words
    df = df.apply(lambda x: word_tokenize(x)) 
    
    return df

df_ingr = text_clean(df_ingr)

In [6]:
df_ingr.head()

0    [2, 105, ounce, cans, cream, chicken, soup, 3,...
1    [2, tablespoons, vegetable, oil, 1, pound, bee...
2    [2, teaspoons, olive, oil, 2, whole, rainbow, ...
3    [4, hot, dog, buns, 4, hot, dogs, basic, air, ...
4    [2, pounds, haddock, fillets, 05, teaspoon, on...
Name: ingredients, dtype: object

In [7]:
model = Word2Vec(df_ingr, min_count=1, vector_size=100, window=5, workers=4)

In [8]:
model.wv.most_similar('chicken')

[('lowsodium', 0.6680896282196045),
 ('beef', 0.6090837121009827),
 ('skinless', 0.5683826804161072),
 ('turkey', 0.5571762919425964),
 ('imagine', 0.5415967106819153),
 ('breast', 0.5248659253120422),
 ('quinoa', 0.5022087097167969),
 ('swanson', 0.496742308139801),
 ('mushroom', 0.4914121627807617),
 ('sodium', 0.4885903000831604)]

In [9]:
# import numpy as np

def calculate_average_vector(tokens, model):
    token_vectors = []
    for token in tokens:
        try:
            token_vectors.append(model.wv[token])
        except KeyError:
            pass
    return np.mean(token_vectors, axis=0)

df_ingr = df_ingr.apply(lambda x: calculate_average_vector(x, model))

df_ingr.head()

0    [1.0678786, -0.40851134, -0.20696071, 0.537388...
1    [0.7485664, -1.2707795, -1.3593858, 0.25103697...
2    [0.5724781, -0.7109995, -1.1411778, -0.0541531...
3    [0.061488416, -0.15301207, 0.19337137, -0.0362...
4    [0.49323934, -0.75901234, -1.5818995, -0.12899...
Name: ingredients, dtype: object

In [10]:
df.columns

Index(['recipe_id', 'recipe_name', 'category', 'ratings', 'description',
       'servings', 'total_time', 'ingredients', 'calories_kcal',
       'carbohydrateContent', 'cholesterolContent', 'fiberContent',
       'proteinContent', 'saturatedFatContent', 'sodiumContent',
       'sugarContent', 'fatContent', 'unsaturatedFatContent', 'servingSize'],
      dtype='object')

In [11]:
df_main = df[['calories_kcal', 'carbohydrateContent', 'proteinContent', 'fatContent', 'servings']]

In [12]:
# add df_ingr to df_main
df_main = pd.concat([df_main, df_ingr], axis=1)

In [13]:
df_main.head()

Unnamed: 0,calories_kcal,carbohydrateContent,proteinContent,fatContent,servings,ingredients
0,400,36.0,22.0,18.0,8,"[1.0678786, -0.40851134, -0.20696071, 0.537388..."
1,268,9.0,23.0,16.0,4,"[0.7485664, -1.2707795, -1.3593858, 0.25103697..."
2,373,2.0,54.0,15.0,6,"[0.5724781, -0.7109995, -1.1411778, -0.0541531..."
3,269,23.0,9.0,15.0,4,"[0.061488416, -0.15301207, 0.19337137, -0.0362..."
4,229,1.0,43.0,5.0,4,"[0.49323934, -0.75901234, -1.5818995, -0.12899..."


In [16]:
ingredients = df_main['ingredients'].values
servings = df_main['servings'].values

# convert ingredients to a 2D array
ingredients = np.vstack(ingredients)

X = np.hstack((ingredients, servings.reshape(-1, 1)))
y = df_main.drop(['ingredients', 'servings'], axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
# create pipeline for linear regression, random forest and decision tree use grid search
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

pipe_lr = Pipeline([('lr', LinearRegression())])
pipe_rf = Pipeline([('rf', RandomForestRegressor())])
pipe_dt = Pipeline([('dt', DecisionTreeRegressor())])

# create param grid for each pipeline
param_grid_lr = [{'lr__fit_intercept': [True, False]}]
param_grid_rf = [{'rf__n_estimators': [100, 200, 300]}]
param_grid_dt = [{'dt__max_depth': [3, 5, 7]}]

# create list of pipelines
pipelines = [pipe_lr, pipe_rf, pipe_dt]

# create list of param grids
param_grids = [param_grid_lr, param_grid_rf, param_grid_dt]

# create list of model names
model_names = ['Linear Regression', 'Random Forest', 'Decision Tree']

#predict
for i in range(len(pipelines)):
    pipelines[i].fit(X_train, y_train)
    print(model_names[i])
    print(pipelines[i].score(X_test, y_test))
    print()

Linear Regression
0.33778189752157717

Random Forest
0.31086356590410225

Decision Tree
-0.45108159128928993



In [82]:
def get_ingredients(ingredients, servings):
    my_ingredients = ingredients
    my_ingredients = pd.Series(my_ingredients)
    my_ingredients = text_clean(my_ingredients).iloc[0]
    my_ingredients = calculate_average_vector(my_ingredients, model)
    my_servings = servings
    my_ingredients = np.hstack((my_ingredients, my_servings)).reshape(1, -1)
    return my_ingredients

In [79]:
# predict
for i in range(len(pipelines)):
    print(model_names[i])
    print(pipelines[i].predict(my_ingredients))
    print()

Linear Regression
[[382.74524387  41.61143635  16.60374451  17.20822138]]

Random Forest
[[343.24  27.66  21.49  16.59]]

Decision Tree
[[512.  36.  16.  34.]]



In [80]:
# grid search pipeline
from sklearn.model_selection import GridSearchCV

# create list of grids
grids = []

# gridsearch for each pipeline
for i in range(len(pipelines)):
    grid = GridSearchCV(pipelines[i], param_grids[i], cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    grids.append(grid)

# predict
for i in range(len(grids)):
    print(model_names[i])
    print(grids[i].score(X_test, y_test))
    print()

Linear Regression
0.33778189752157717

Random Forest
0.3079114612483292

Decision Tree
0.12093066400266625



In [81]:
#predict
for i in range(len(grids)):
    print(model_names[i])
    print(grids[i].predict(my_ingredients))
    print()

Linear Regression
[[382.74524387  41.61143635  16.60374451  17.20822138]]

Random Forest
[[363.23666667  23.86        23.38666667  20.12666667]]

Decision Tree
[[434.02337814  35.93863238  21.70485096  23.05990649]]



In [84]:
def predict(my_ingredients):
    for i in range(len(grids)):
        print(model_names[i])
        print(grids[i].predict(my_ingredients))
        print()

In [91]:
ingredients = '2 bunches collard greens,2 bunches mustard greens,2 bunches turnip greens,4 cups chicken broth, divided,3 tablespoons salt,½ cup vegetable oil,½ cup cooked real bacon bits,½ cup white sugar,4 cloves garlic, sliced,salt and pepper to taste'
predict(get_ingredients(ingredients, 12))

Linear Regression
[[475.33760584  38.47506537  22.40375042  25.53963892]]

Random Forest
[[379.97166667  30.18833333  19.16666667  20.38666667]]

Decision Tree
[[337.90463576  34.09471788  13.67516556  16.74470199]]



In [92]:
# get name of top 5 similar recipes based on ingredients return recipe name using cosine similarity
def get_similar_recipes(ingredients, servings):
    my_ingredients = ingredients
    my_ingredients = pd.Series(my_ingredients)
    my_ingredients = text_clean(my_ingredients).iloc[0]
    my_ingredients = calculate_average_vector(my_ingredients, model)
    my_servings = servings
    my_ingredients = np.hstack((my_ingredients, my_servings)).reshape(1, -1)
    my_ingredients = grids[0].best_estimator_.predict(my_ingredients)
    my_ingredients = my_ingredients.reshape(1, -1)
    return my_ingredients


In [93]:
get_similar_recipes(ingredients, 12)

array([[475.33760584,  38.47506537,  22.40375042,  25.53963892]])

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the index of the input recipe
input_recipe_index = df_main[df_main['recipe_name'] == 'Indian Chicken Curry (Murgh Kari)'].index[0]

# Get the vector representation of the input recipe's ingredients
input_recipe_vector = df_main.iloc[input_recipe_index]['ingredients']

# Calculate cosine similarity between input recipe and all other recipes
cosine_similarities = cosine_similarity(input_recipe_vector.reshape(1, -1), df_main['ingredients'])

# Get the top 5 most similar recipes
similar_recipe_indices = cosine_similarities.argsort()[0][-6:-1]  # Excluding the input recipe itself
similar_recipe_names = df_main.iloc[similar_recipe_indices]['recipe_name']

print("Top 5 similar recipes:")
print(similar_recipe_names)

KeyError: 'recipe_name'