# Creating the Recommender Model Using Cosine Similarity

In [1]:
#Libraries for general analysis and data set manipulation:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
#Importing libraries to manage data size and to build the recommender:
import sys
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [3]:
#Reading in the data to a dataframe called df_recs:
df_recs = pd.read_csv('./data/df_recs_clean_final.csv')

In [8]:
#Inspecting the head of the dataframe:
df_recs.head()

Unnamed: 0,url,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,...,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
1,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,...,0,0,0,0,0,0,0,0,0,0
2,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
3,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,...,0,0,0,0,1,0,0,0,0,0
4,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,...,0,0,0,0,1,0,0,0,0,0


In [5]:
#Looking at size of df_recs: (unit:bytes - mult by 1e-9 to get to GB - Douglas Strodtman)
sys.getsizeof(df_recs)
#Source: Riley Dallas (General Assembly Lesson 8.05)

66005398

In [6]:
#It looks like typically the recipe title is in the url, so recipe titles will be removed
df_recs.drop(labels = 'title',
             axis = 1,
             inplace = True)

In [9]:
#Checking work:
df_recs.head()

Unnamed: 0,url,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,...,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
1,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,...,0,0,0,0,0,0,0,0,0,0
2,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
3,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,...,0,0,0,0,1,0,0,0,0,0
4,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,...,0,0,0,0,1,0,0,0,0,0


In [10]:
#The url will be used as the index because the url is different (unique) for each recipe  
df_recs.set_index('url', inplace = True)

In [11]:
#Checking work:
df_recs.head()

Unnamed: 0_level_0,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,...,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,...,0,0,1,0,0,0,0,0,1,0
http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,...,0,0,0,0,0,0,0,0,0,0
http://www.saveur.com/article/Recipes/Negroni-Cocktail,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,...,0,0,1,0,0,0,0,0,1,0
http://food52.com/recipes/27825-simple-fresh-pasta,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,...,0,0,0,0,1,0,0,0,0,0
http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,...,0,0,0,0,1,0,0,0,0,0


In [12]:
#Looking at datatypes to make sure only numerical data is in df_recs:
df_recs.dtypes.value_counts()

int64      301
float64     34
dtype: int64

In [13]:
#Looking at the size of df_recs: (unit: bytes)
sys.getsizeof(df_recs)

64212225

In [14]:
#Checking the first five rows of df_recs:
df_recs.head()

Unnamed: 0_level_0,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,...,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,...,0,0,1,0,0,0,0,0,1,0
http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,...,0,0,0,0,0,0,0,0,0,0
http://www.saveur.com/article/Recipes/Negroni-Cocktail,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,...,0,0,1,0,0,0,0,0,1,0
http://food52.com/recipes/27825-simple-fresh-pasta,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,...,0,0,0,0,1,0,0,0,0,0
http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,...,0,0,0,0,1,0,0,0,0,0


In [15]:
standard_scaler = StandardScaler()

In [16]:
df_recs_scaled = standard_scaler.fit_transform(df_recs)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
df_recs_sc = pd.DataFrame(df_recs_scaled, columns = df_recs.columns, index = df_recs.index)

In [18]:
df_recs_sc.head()

Unnamed: 0_level_0,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,...,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,-0.564033,-0.559532,-0.203759,-0.914439,-1.040374,-0.228105,-0.786377,-0.660801,-0.390411,-0.528507,...,-1.139301,-0.452991,3.974203,-0.159047,-1.707377,-0.351998,-0.014765,-0.50339,5.193922,-0.248375
http://www.marthastewart.com/337857/pasta-dough,-0.091116,-0.386535,-0.024473,-0.407312,-0.82903,-0.040543,-0.210685,-0.222601,-0.390411,-0.520866,...,-1.139301,-0.452991,-0.251623,-0.159047,-1.707377,-0.351998,-0.014765,-0.50339,-0.192533,-0.248375
http://www.saveur.com/article/Recipes/Negroni-Cocktail,-0.564033,-0.559726,-0.203759,-0.924682,-1.041433,-0.228105,-0.786377,-0.660801,-0.390411,-0.528507,...,-1.139301,-0.452991,3.974203,-0.159047,-1.707377,-0.351998,-0.014765,-0.50339,5.193922,-0.248375
http://food52.com/recipes/27825-simple-fresh-pasta,-0.091116,-0.437099,-0.203759,-0.666293,-0.921323,-0.228105,-0.720165,-0.553375,-0.38171,-0.52573,...,-1.139301,-0.452991,-0.251623,-0.159047,0.034006,-0.351998,-0.014765,-0.50339,-0.192533,-0.248375
http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,-0.091116,-0.40113,-0.203759,-0.507564,-0.85567,-0.228105,-0.672413,-0.512736,-0.385423,-0.526011,...,-1.139301,-0.452991,-0.251623,-0.159047,0.034006,-0.351998,-0.014765,-0.50339,-0.192533,-0.248375


In [19]:
#Looking at size of df_recs: (unit:bytes - mult by 1e-9 to get to GB - Douglas Strodtman)
sys.getsizeof(df_recs_sc)

64212225

In [20]:
df_recs_sparse = sparse.csr_matrix(df_recs_sc)
#Sources: 
#Riley Dallas (General Assembly Lesson 8.05)
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

In [21]:
#Looking at size of df_recs: (unit:bytes - mult by 1e-9 to get to GB - Douglas Strodtman)
sys.getsizeof(df_recs_sparse)

56

In [22]:
df_recs.shape

(22941, 335)

In [23]:
df_recs_sparse.shape

(22941, 335)

In [24]:
type(df_recs_sparse)

scipy.sparse.csr.csr_matrix

In [25]:
#print(df_recs_sparse[:5,:]) #looking at whether empty columns

Sklearn has a built-in pairwise_distances function that can be used for the recommender. It will return a square matrix comparing every recipe with every other recipe in the dataset.

In [26]:
#Calculating cosine similarity using sklearn's built-in pairwise distance function:
#recommender = pairwise_distances(df_recs_sparse, metric = 'cosine')
#recommender.shape

In [28]:
cos_sim_matrix = cosine_similarity(df_recs_sparse, df_recs_sparse)

In [29]:
#Placing the cosine similarity index into a pandas dataframe for increased ease of use:
recommender = pd.DataFrame(cos_sim_matrix,
                           columns = df_recs_sc.index, 
                           index = df_recs_sc.index)

In [30]:
recommender.head()

url,http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,http://www.marthastewart.com/337857/pasta-dough,http://www.saveur.com/article/Recipes/Negroni-Cocktail,http://food52.com/recipes/27825-simple-fresh-pasta,http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,http://www.seriouseats.com/recipes/2011/02/time-for-a-drink-pisco-sour.html,http://www.marthastewart.com/354987/fresh-pasta,https://food52.com/recipes/10620-cauliflower-popcorn,http://www.saveur.com/article/Recipes/Manhattan-1000088919,http://www.seriouseats.com/recipes/2011/11/how-to-make-mulled-cider.html,...,http://www.recipezaar.com/Old-fashioned-Linguine-with-White-Clam-Sauce-17722,http://www.food.com/recipe/seafood-casserole-for-2-252748,http://www.myrecipes.com/recipe/chicken-dumplings-13,http://www.delish.com/cooking/recipe-ideas/recipes/a31279/greek-chicken-pitas-recipe-ghk0411/,https://www.foodnetwork.com/recipes/anne-burrell/linguine-with-white-clam-sauce-recipe-1919333,http://www.delish.com/cooking/recipe-ideas/recipes/a30834/quick-mu-shu-pork-recipe-ghk0910/,http://www.food.com/recipe/red-lobster-tartar-sauce-335329,http://www.foodista.com/recipe/F3RYN6B2/healthy-hazelnut-cookies,https://www.allrecipes.com/recipe/22188/yeast-doughnuts/,http://www.delish.com/cooking/recipe-ideas/recipes/a26274/spicy-southern-fried-chicken-3300/
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,1.0,0.196066,0.810089,0.316115,0.284985,0.776981,0.35953,0.255409,0.779501,0.481346,...,-0.077149,-0.118894,-0.129485,-0.091884,0.095813,-0.007959,0.145449,-0.149429,-0.08119,-0.082039
http://www.marthastewart.com/337857/pasta-dough,0.196066,1.0,0.147577,0.438968,0.41919,0.121983,0.720198,0.139687,0.136293,0.063641,...,-0.028069,-0.086442,-0.090279,-0.108408,0.056024,-0.062388,0.050836,-0.14676,0.039305,-0.030293
http://www.saveur.com/article/Recipes/Negroni-Cocktail,0.810089,0.147577,1.0,0.267127,0.238482,0.619307,0.308634,0.250212,0.963219,0.239681,...,-0.100467,-0.11909,-0.103832,-0.092581,0.061392,-0.008735,0.140994,-0.140123,-0.068749,-0.070368
http://food52.com/recipes/27825-simple-fresh-pasta,0.316115,0.438968,0.267127,1.0,0.659965,0.237669,0.704346,0.413011,0.251794,0.223517,...,-0.067304,-0.129951,-0.114661,-0.116044,0.041238,-0.071753,0.149251,-0.241243,-0.001491,-0.065068
http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,0.284985,0.41919,0.238482,0.659965,1.0,0.21305,0.666991,0.170004,0.224605,0.191287,...,-0.058284,-0.121972,-0.110462,-0.112491,0.034147,-0.063314,0.118777,-0.192723,0.010705,-0.057352


In [31]:
#Saving the recommender to a csv so it can be loaded easily later on:
#recommender.to_csv('./data/recommender.csv')

In [33]:
#Looking at size of df_recs: (unit:bytes)
sys.getsizeof(recommender)
#Source: Riley Dallas (General Assembly Lesson 8.05)

4213701553

In [35]:
#Looking at datatypes to make sure only numerical data is in df_recs:
recommender.dtypes.value_counts()

float64    22941
dtype: int64

In [42]:
recommender_float16 = recommender.astype('float16')
#Sources:
#https://medium.com/@vincentteyssier/optimizing-the-size-of-a-pandas-dataframe-for-low-memory-environment-5f07db3d72e
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
#https://www.dataquest.io/blog/pandas-big-data/

In [49]:
recommender_float16.head()

url,http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,http://www.marthastewart.com/337857/pasta-dough,http://www.saveur.com/article/Recipes/Negroni-Cocktail,http://food52.com/recipes/27825-simple-fresh-pasta,http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,http://www.seriouseats.com/recipes/2011/02/time-for-a-drink-pisco-sour.html,http://www.marthastewart.com/354987/fresh-pasta,https://food52.com/recipes/10620-cauliflower-popcorn,http://www.saveur.com/article/Recipes/Manhattan-1000088919,http://www.seriouseats.com/recipes/2011/11/how-to-make-mulled-cider.html,...,http://www.recipezaar.com/Old-fashioned-Linguine-with-White-Clam-Sauce-17722,http://www.food.com/recipe/seafood-casserole-for-2-252748,http://www.myrecipes.com/recipe/chicken-dumplings-13,http://www.delish.com/cooking/recipe-ideas/recipes/a31279/greek-chicken-pitas-recipe-ghk0411/,https://www.foodnetwork.com/recipes/anne-burrell/linguine-with-white-clam-sauce-recipe-1919333,http://www.delish.com/cooking/recipe-ideas/recipes/a30834/quick-mu-shu-pork-recipe-ghk0910/,http://www.food.com/recipe/red-lobster-tartar-sauce-335329,http://www.foodista.com/recipe/F3RYN6B2/healthy-hazelnut-cookies,https://www.allrecipes.com/recipe/22188/yeast-doughnuts/,http://www.delish.com/cooking/recipe-ideas/recipes/a26274/spicy-southern-fried-chicken-3300/
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.seriouseats.com/recipes/2010/06/the-martini-recipe.html,1.0,0.196045,0.810059,0.316162,0.284912,0.776855,0.359619,0.255371,0.779297,0.481445,...,-0.077148,-0.118896,-0.129517,-0.091858,0.095825,-0.007957,0.145508,-0.149414,-0.081177,-0.082031
http://www.marthastewart.com/337857/pasta-dough,0.196045,1.0,0.147583,0.438965,0.419189,0.122009,0.720215,0.139648,0.136353,0.06366,...,-0.028076,-0.086426,-0.090271,-0.108398,0.05603,-0.062378,0.050842,-0.146729,0.039307,-0.030289
http://www.saveur.com/article/Recipes/Negroni-Cocktail,0.810059,0.147583,1.0,0.26709,0.238525,0.619141,0.308594,0.250244,0.963379,0.239624,...,-0.100464,-0.11908,-0.103821,-0.09259,0.061401,-0.008736,0.140991,-0.140137,-0.068726,-0.070374
http://food52.com/recipes/27825-simple-fresh-pasta,0.316162,0.438965,0.26709,1.0,0.660156,0.237671,0.704102,0.413086,0.251709,0.223511,...,-0.067322,-0.130005,-0.114685,-0.116028,0.041229,-0.071777,0.149292,-0.241211,-0.001492,-0.065063
http://www.epicurious.com/recipes/food/views/Egg-Noodle-351712,0.284912,0.419189,0.238525,0.660156,1.0,0.213013,0.666992,0.170044,0.224609,0.191284,...,-0.058289,-0.121948,-0.110474,-0.112488,0.034149,-0.063293,0.118774,-0.192749,0.010704,-0.057343


In [44]:
#Looking at datatypes to make sure only numerical data is in df_recs:
recommender_float16.dtypes.value_counts()

float16    22941
dtype: int64

In [46]:
#Looking at size of df_recs: (unit:bytes)
sys.getsizeof(recommender_float16)
#Source: Riley Dallas (General Assembly Lesson 8.05)

1055964667

In [51]:
#Savind the recommender to a csv so it can be uploaded into the next notebook (and for future use):
recommender_float16.to_csv('./data/recommender_final.csv')