# Data Merging, Cleaning, and Some EDA  
In this notebook, I merged data, cleaned data, created dummy matrices, explored the data, and merged the final recipes dataframe.

In [1]:
#Libraries for general analysis and data set manipulation:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
#Importing glob, a useful module to find all files following a specified pattern in a directory: 
import glob

#Sources: 
#https://docs.python.org/2/library/glob.html
#Roy Kim helped with use of glob as well

In [3]:
#Importing libraries for natural language processing of recipe titles (if I get to it):

#Importing regex:
import regex as re

#Importing WordNetLemmatizer:
from nltk.stem import WordNetLemmatizer

#Instantiating WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()

#Importing stopwords (the stop word list)
from nltk.corpus import stopwords

# Importing CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
#Using the glob module to find all csv files saved with names starting with "recipes_dishType" in the data directory:
files = glob.glob('./data/recipes_dishType*.csv')

#Source: https://docs.python.org/2/library/glob.html

In [5]:
#Inspecting the files collected using glob:
files

['./data/recipes_dishType_dinner_NutrientPROCNT_MaxAmt50.csv',
 './data/recipes_dishType_breakfast_NutrientFAPU_MaxAmt67.csv',
 './data/recipes_dishType_breakfast_NutrientPROCNT_MaxAmt50.csv',
 './data/recipes_dishType_dinner_NutrientK_MaxAmt3400.csv',
 './data/recipes_dishType_dinner_NutrientCHOCDF_MaxAmt325.csv',
 './data/recipes_dishType_breakfast_NutrientTOCPHA_MaxAmt19.csv',
 './data/recipes_dishType_dessert_NutrientCHOCDF_MaxAmt325.csv',
 './data/recipes_dishType_nibble_NutrientVITC_MaxAmt125.csv',
 './data/recipes_dishType_dinner_NutrientTOCPHA_MaxAmt19.csv',
 './data/recipes_dishType_lunch_NutrientPROCNT_MaxAmt50.csv',
 './data/recipes_dishType_dessert_NutrientK_MaxAmt3400.csv',
 './data/recipes_dishType_lunch_NutrientFAMS_MaxAmt67.csv',
 './data/recipes_dishType_breakfast_NutrientFASAT_MaxAmt22.csv',
 './data/recipes_dishType_nibble_NutrientENERC_KCAL_MaxAmt3200.csv',
 './data/recipes_dishType_dessert_NutrientVITD_MaxAmt20.csv',
 './data/recipes_dishType_breakfast_NutrientFIBT

In [6]:
#Finding the number of csv files in the data folder:
len(files)

150

In [7]:
#Number of nutrient types inspected multiplied by number of dish types per nutrient type:
30*5

150

There are 150 csv files which makes sense because recipes were collected for 28 nutrients, and two of the nutrients had two query calls instead of just one query call for the nutrients: folate had a folate equivalent query and a folate food query, and sugar had a total sugar query and an added sugar query.

In [8]:
#Instantiating a Pandas dataframe called df_recs:
df_recs = pd.DataFrame()

In [9]:
#Addending recipe data from each csv to the df_recs dataframe using a for loop to loop through each
#file in the files list:
#(Note: ignoring the index of the files to avoid creating an unnamed index column in the df_recs dataframe)
for file in files:
    df_recs = df_recs.append(pd.read_csv(file), ignore_index = True)

In [10]:
#Looking at the dimensions of df_recs:
df_recs.shape

(305542, 74)

There are 305,542 recipes in df_recs, and there are 74 features collected for each recipe.

In [11]:
#Increasing the number of columns that will be displayed when inspecting the head of a dataframe:
pd.set_option('display.max_columns', 75)
#Source: https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns

In [12]:
#Inspecting the first five rows of df_recs:
df_recs.head()

Unnamed: 0,uri,title,image,source,url,share_as,yield,diet_labels,health_labels,cautions,list_ingredients_lines,list_ingredients,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,monounsat_fat_pct,polyunsat_fat_g,polyunsat_fat_pct,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,trans_fat_pct,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,sugar_pct,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct
0,http://www.edamam.com/ontologies/edamam.owl#re...,Pasta Dough,https://www.edamam.com/web-img/da7/da77a3037df...,Ruhlman,http://ruhlman.com/2011/05/how-to-make-pasta/,http://www.edamam.com/recipe/pasta-dough-d88e1...,6.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['9 ounces/255 grams all-purpose flour', '3 eg...",[{'text': '9 ounces/255 grams all-purpose flou...,1113.200378,384.145708,96.0,['Italian'],['pasta'],['dinner'],110.511856,11.051186,195.63049,65.210163,479.88,159.96,1113.200378,55.660019,4.940797,,3.518942,,4.428016,22.140079,14.768328,22.720505,0.04902,,14.096261,78.31256,6.888934,27.555736,803.104011,200.776003,451.025908,9.596296,71.612056,17.050489,188.282914,7.845121,15.160553,94.753454,530.977365,75.853909,42.558952,85.117903,1.84995,142.303831,1.166193,,2.054494,171.207817,206.4,22.933333,0.331564,25.504932,1.1481,47.8375,,,2.58,17.2,1.941335,12.942234,1.152437,0.960364
1,http://www.edamam.com/ontologies/edamam.owl#re...,White Russian,https://www.edamam.com/web-img/fa2/fa21fb6fb9a...,Leite's Culinaria,http://leitesculinaria.com/79362/recipes-white...,http://www.edamam.com/recipe/white-russian-f8d...,1.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",['Wheat'],"['2 ounces vodka', '1 ounce Kahlua', 'Splash o...","[{'text': '2 ounces vodka', 'quantity': 2.0, '...",206.965694,87.600026,0.0,['Russian'],['cocktail'],['dinner'],1.941942,0.194194,7.895654,2.631885,3.495496,1.165165,206.965694,10.348285,0.397954,,0.073896,,2.450499,12.252494,3.098603,4.767081,,,0.01494,0.083001,0.02835,0.113398,1.80303,0.450757,26.010687,0.553419,1.879573,0.447517,21.664706,0.902696,0.030479,0.190491,14.622684,2.088955,0.222402,0.444804,0.008193,0.630232,0.071186,,0.008216,0.684641,10.769984,1.196665,0.005483,0.421754,0.004593,0.191359,0.440552,0.489502,0.01786,0.119068,0.027045,0.180303,0.081647,0.068039
2,http://www.edamam.com/ontologies/edamam.owl#re...,Martini Recipe,https://www.edamam.com/web-img/b06/b06df6af92e...,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,http://www.edamam.com/recipe/martini-recipe-9a...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 ounces dry gin', '1 ounce dry vermouth', '...","[{'text': '2 ounces dry gin', 'quantity': 2.0,...",174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,,,174.357137,8.717857,,,,,,,,,,,0.105125,0.584027,,,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,,0.001452,0.121019,,,0.015315,1.178041,,,,,,,,,,
3,http://www.edamam.com/ontologies/edamam.owl#re...,Martini,https://www.edamam.com/web-img/0ff/0ff27f187ce...,David Lebovitz,http://www.davidlebovitz.com/2015/05/the-marti...,http://www.edamam.com/recipe/martini-a46dc893d...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",['Sulfites'],"['5 ounces gin', '1 ounce dry vermouth', '2 ol...","[{'text': '5 ounces gin', 'quantity': 5.0, 'me...",403.686333,176.497139,0.0,['Italian'],['cocktail'],['dinner'],7.899962,0.789996,1.171747,0.390582,,,403.686333,20.184317,0.504832,,0.058304,,0.09056,0.4528,0.68352,1.051569,,,0.316093,1.756074,0.2048,0.8192,0.283495,0.070874,28.578028,0.608043,3.374448,0.80344,51.292428,2.137185,0.049428,0.308926,5.861905,0.837415,0.073605,0.147209,0.00652,0.501568,0.223961,,0.001609,0.134123,1.28,0.142222,0.015885,1.221903,,,0.0576,0.064,,,0.1056,0.704,0.0896,0.074667
4,http://www.edamam.com/ontologies/edamam.owl#re...,Ham Stock,https://www.edamam.com/web-img/ae0/ae073b47398...,No Recipes,http://norecipes.com/recipe/ham-stock/,http://www.edamam.com/recipe/ham-stock-37b6f29...,9.0,"['Low-Carb', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['1 tablespoon vegetable oil', '250 grams leek...","[{'text': '1 tablespoon vegetable oil', 'quant...",67.6904,2057.06,0.0,['Spanish'],['soup'],['dinner'],68.255,6.8255,1.5462,0.5154,19.52,6.506667,67.6904,3.38452,1.76963,,0.549469,,1.058062,5.290308,3.742,5.756923,0.027975,,0.5044,2.802222,0.18,0.72,6.4,1.6,126.48,2.691064,29.345,6.986905,343.7,14.320833,2.08224,13.014,81.26,11.608571,7.0652,14.1304,0.07276,5.596923,0.4092,,0.14808,12.34,8.94,0.993333,0.16058,12.352308,0.1728,7.2,1.2,1.333333,,,0.31328,2.088533,4.7,3.916667


In [13]:
#Looking at general information about df_recs:
df_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305542 entries, 0 to 305541
Data columns (total 74 columns):
uri                       305542 non-null object
title                     305542 non-null object
image                     305542 non-null object
source                    305542 non-null object
url                       305542 non-null object
share_as                  305542 non-null object
yield                     305542 non-null float64
diet_labels               305542 non-null object
health_labels             305542 non-null object
cautions                  305542 non-null object
list_ingredients_lines    305542 non-null object
list_ingredients          305542 non-null object
calories                  305542 non-null float64
total_weight              305542 non-null float64
total_time                305542 non-null float64
cuisine_type              305542 non-null object
meal_type                 304869 non-null object
dish_type                 305542 non-null object
cal

In [14]:
#Finding the number of columns with each datatype:
df_recs.dtypes.value_counts()

#Source:
#https://stackoverflow.com/questions/32337380/count-data-types-in-pandas-dataframe

float64    60
object     14
dtype: int64

There are 14 object datatype columns and 60 float datatype columns in df_recs.

In [15]:
#Looking at how much memory is used by df_recs:
df_recs.memory_usage().sum()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.memory_usage.html

180880944

The memory usage of df_recs is approximately 181 MB.

There are a lot of repeated recipes in df_recs due to the nature of the queries.  For example, a lot of recipes collected during the query with low levels of calcium likely also were collected during the query for recipes with low levels of vitamin D.  The following code will be used to remove repeated recipes from df_recs and to ensure each recipe in df_recs is a unique recipe.

In [16]:
#Dropping duplicate rows based on URL since the URL is a unique identifier for each recipe:
df_recs.drop_duplicates(subset = 'url', inplace = True)

#Sources:
#http://pandas.pydata.org/pandas-docs/version/0.17/generated/pandas.DataFrame.drop_duplicates.html
#https://jamesrledoux.com/code/drop_duplicates

In [17]:
df_recs.reset_index(drop=True, inplace=True)

In [18]:
#Checking the dimensions of df_recs after duplicate recipes were removed:
df_recs.shape

(42777, 74)

In [19]:
#Looking at how much memory is used by df_recs:
df_recs.memory_usage().sum()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.memory_usage.html

25324064

The memory usage decreased significantly from approx. 181 MB to approx. 26 MB after duplicate recipes were removed.  Unfortunately I thought I had a lot more unique recipes than 42,777 recipes.  In the future, I would like to collect more unique recipes by altering my data collection methodology.

In [26]:
df_recs_top_sources = df_recs['source'].value_counts().iloc[:20].index
df_recs_top_sources

Index(['Allrecipes', 'food.com', 'Food Network', 'Group Recipes',
       'recipezaar.com', 'Taste of Home', 'Martha Stewart', 'BigOven',
       'My Recipes', 'Delish', 'Foodista', 'Food52', 'Epicurious',
       'Williams-Sonoma', 'Serious Eats', 'Cookstr', 'Good Housekeeping',
       'Kraft Foods', 'Saveur', 'Kitchen Daily'],
      dtype='object')

In [30]:
df_recs = df_recs[df_recs['source'].isin(df_recs_top_sources)]
df_recs.head()
#Source:
#https://stackoverflow.com/questions/27965295/dropping-rows-from-dataframe-based-on-a-not-in-condition

Unnamed: 0,uri,title,image,source,url,share_as,yield,diet_labels,health_labels,cautions,list_ingredients_lines,list_ingredients,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,monounsat_fat_pct,polyunsat_fat_g,polyunsat_fat_pct,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,trans_fat_pct,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,sugar_pct,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct
2,http://www.edamam.com/ontologies/edamam.owl#re...,Martini Recipe,https://www.edamam.com/web-img/b06/b06df6af92e...,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,http://www.edamam.com/recipe/martini-recipe-9a...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 ounces dry gin', '1 ounce dry vermouth', '...","[{'text': '2 ounces dry gin', 'quantity': 2.0,...",174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,,,174.357137,8.717857,,,,,,,,,,,0.105125,0.584027,,,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,,0.001452,0.121019,,,0.015315,1.178041,,,,,,,,,,
6,http://www.edamam.com/ontologies/edamam.owl#re...,Pasta Dough,https://www.edamam.com/web-img/bf8/bf8db947c7e...,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,http://www.edamam.com/recipe/pasta-dough-585c8...,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],"['2 cups ""00"" flour, plus more for kneading', ...","[{'text': '2 cups ""00"" flour, plus more for kn...",2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,,16.746705,,34.72498,173.6249,106.186,163.363077,,,12.2826,68.236667,6.75,27.0,561.4,140.35,638.235,13.579468,72.0,17.142857,168.47,7.019583,3.2066,20.04125,1596.0,228.0,79.749,159.498,1.8952,145.784615,2.579,,0.8984,74.866667,1295.4,143.933333,1.3,100.0,6.63,276.25,,,18.36,122.4,10.85925,72.395,11.257,9.380833
8,http://www.edamam.com/ontologies/edamam.owl#re...,Classic Negroni Cocktail Recipe,https://www.edamam.com/web-img/c19/c19d37fdaf7...,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,http://www.edamam.com/recipe/classic-negroni-c...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['1 oz sweet vermouth', '1 oz gin', '1 oz camp...","[{'text': '1 oz sweet vermouth', 'quantity': 1...",163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,,,163.576748,8.178837,,,,,,,,,,,0.116233,0.645739,,,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,,0.003118,0.259871,,,0.015592,1.199403,,,,,,,,,,
13,http://www.edamam.com/ontologies/edamam.owl#re...,Simple Fresh Pasta,https://www.edamam.com/web-img/947/9476a940066...,Food52,http://food52.com/recipes/27825-simple-fresh-p...,http://www.edamam.com/recipe/simple-fresh-past...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['3 large eggs', '300 grams plain flour', '1 t...","[{'text': '3 large eggs', 'quantity': 3.0, 'me...",1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,,4.1055,,5.154,25.77,17.205,26.469231,0.057,,6.142045,34.122474,8.1,32.4,148.5,37.125,528.170796,11.237677,84.02135,20.005083,1046.466,43.60275,3.8625,24.140625,621.0,88.714286,49.83,99.66,0.8055,61.961538,1.365,,0.42,35.0,240.0,26.666667,0.387,29.769231,1.335,55.625,,,3.0,20.0,1.755,11.7,1.35,1.125
19,http://www.edamam.com/ontologies/edamam.owl#re...,Egg Noodle,https://www.edamam.com/web-img/de3/de3ac901e9b...,Epicurious,http://www.epicurious.com/recipes/food/views/E...,http://www.edamam.com/recipe/egg-noodle-59c705...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 cups all-purpose flour', '1 cup semolina f...","[{'text': '2 cups all-purpose flour', 'quantit...",1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,,5.658615,,8.471165,42.355827,26.678313,41.043559,0.03268,,16.639369,92.44094,13.263,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,34.888751,1294.982296,53.957596,20.365128,127.28205,877.48,125.354286,66.345453,132.690907,2.046036,157.387385,1.294853,,2.559305,213.275444,342.832,38.092444,0.616743,47.441795,1.8158,75.658333,,,4.6288,30.858667,2.86776,19.1184,1.385067,1.154222


In [31]:
df_recs.shape

(22941, 74)

In [32]:
df_recs['source'].value_counts()

Allrecipes           3186
food.com             2832
Food Network         1911
Group Recipes        1694
recipezaar.com       1334
Taste of Home        1330
Martha Stewart       1324
BigOven              1313
My Recipes           1291
Delish               1268
Foodista             1152
Food52                786
Epicurious            543
Williams-Sonoma       511
Serious Eats          480
Cookstr               447
Good Housekeeping     405
Kraft Foods           386
Saveur                376
Kitchen Daily         372
Name: source, dtype: int64

In [33]:
df_recs.reset_index(drop=True, inplace=True)

In [34]:
df_recs.head()

Unnamed: 0,uri,title,image,source,url,share_as,yield,diet_labels,health_labels,cautions,list_ingredients_lines,list_ingredients,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,monounsat_fat_pct,polyunsat_fat_g,polyunsat_fat_pct,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,trans_fat_pct,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,sugar_pct,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct
0,http://www.edamam.com/ontologies/edamam.owl#re...,Martini Recipe,https://www.edamam.com/web-img/b06/b06df6af92e...,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,http://www.edamam.com/recipe/martini-recipe-9a...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 ounces dry gin', '1 ounce dry vermouth', '...","[{'text': '2 ounces dry gin', 'quantity': 2.0,...",174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,,,174.357137,8.717857,,,,,,,,,,,0.105125,0.584027,,,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,,0.001452,0.121019,,,0.015315,1.178041,,,,,,,,,,
1,http://www.edamam.com/ontologies/edamam.owl#re...,Pasta Dough,https://www.edamam.com/web-img/bf8/bf8db947c7e...,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,http://www.edamam.com/recipe/pasta-dough-585c8...,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],"['2 cups ""00"" flour, plus more for kneading', ...","[{'text': '2 cups ""00"" flour, plus more for kn...",2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,,16.746705,,34.72498,173.6249,106.186,163.363077,,,12.2826,68.236667,6.75,27.0,561.4,140.35,638.235,13.579468,72.0,17.142857,168.47,7.019583,3.2066,20.04125,1596.0,228.0,79.749,159.498,1.8952,145.784615,2.579,,0.8984,74.866667,1295.4,143.933333,1.3,100.0,6.63,276.25,,,18.36,122.4,10.85925,72.395,11.257,9.380833
2,http://www.edamam.com/ontologies/edamam.owl#re...,Classic Negroni Cocktail Recipe,https://www.edamam.com/web-img/c19/c19d37fdaf7...,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,http://www.edamam.com/recipe/classic-negroni-c...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['1 oz sweet vermouth', '1 oz gin', '1 oz camp...","[{'text': '1 oz sweet vermouth', 'quantity': 1...",163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,,,163.576748,8.178837,,,,,,,,,,,0.116233,0.645739,,,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,,0.003118,0.259871,,,0.015592,1.199403,,,,,,,,,,
3,http://www.edamam.com/ontologies/edamam.owl#re...,Simple Fresh Pasta,https://www.edamam.com/web-img/947/9476a940066...,Food52,http://food52.com/recipes/27825-simple-fresh-p...,http://www.edamam.com/recipe/simple-fresh-past...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['3 large eggs', '300 grams plain flour', '1 t...","[{'text': '3 large eggs', 'quantity': 3.0, 'me...",1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,,4.1055,,5.154,25.77,17.205,26.469231,0.057,,6.142045,34.122474,8.1,32.4,148.5,37.125,528.170796,11.237677,84.02135,20.005083,1046.466,43.60275,3.8625,24.140625,621.0,88.714286,49.83,99.66,0.8055,61.961538,1.365,,0.42,35.0,240.0,26.666667,0.387,29.769231,1.335,55.625,,,3.0,20.0,1.755,11.7,1.35,1.125
4,http://www.edamam.com/ontologies/edamam.owl#re...,Egg Noodle,https://www.edamam.com/web-img/de3/de3ac901e9b...,Epicurious,http://www.epicurious.com/recipes/food/views/E...,http://www.edamam.com/recipe/egg-noodle-59c705...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 cups all-purpose flour', '1 cup semolina f...","[{'text': '2 cups all-purpose flour', 'quantit...",1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,,5.658615,,8.471165,42.355827,26.678313,41.043559,0.03268,,16.639369,92.44094,13.263,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,34.888751,1294.982296,53.957596,20.365128,127.28205,877.48,125.354286,66.345453,132.690907,2.046036,157.387385,1.294853,,2.559305,213.275444,342.832,38.092444,0.616743,47.441795,1.8158,75.658333,,,4.6288,30.858667,2.86776,19.1184,1.385067,1.154222


In [35]:
#Looking at information about df_recs:
df_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22941 entries, 0 to 22940
Data columns (total 74 columns):
uri                       22941 non-null object
title                     22941 non-null object
image                     22941 non-null object
source                    22941 non-null object
url                       22941 non-null object
share_as                  22941 non-null object
yield                     22941 non-null float64
diet_labels               22941 non-null object
health_labels             22941 non-null object
cautions                  22941 non-null object
list_ingredients_lines    22941 non-null object
list_ingredients          22941 non-null object
calories                  22941 non-null float64
total_weight              22941 non-null float64
total_time                22941 non-null float64
cuisine_type              22941 non-null object
meal_type                 22941 non-null object
dish_type                 22941 non-null object
calcium_mg             

In [37]:
#Looking at the first 5 recipes' titles:
df_recs['title'].head()

0                     Martini Recipe
1                        Pasta Dough
2    Classic Negroni Cocktail Recipe
3                 Simple Fresh Pasta
4                         Egg Noodle
Name: title, dtype: object

In [38]:
#Looking at the list_ingredients_lines for the first recipe in df_recs:
df_recs['list_ingredients_lines'].iloc[0]

"['2 ounces dry gin', '1 ounce dry vermouth', '1 dash orange bitters (optional, but highly recommended)']"

In [39]:
#Looking at the list_ingredients for the first recipe in df_recs:
df_recs['list_ingredients'].iloc[0]

"[{'text': '2 ounces dry gin', 'quantity': 2.0, 'measure': 'ounce', 'food': 'gin', 'weight': 56.69904625, 'foodCategory': 'liquors and cocktails', 'foodId': 'food_b7mc1axbki31lyavyfpewa08ec3m'}, {'text': '1 ounce dry vermouth', 'quantity': 1.0, 'measure': 'ounce', 'food': 'dry vermouth', 'weight': 28.349523125, 'foodCategory': 'wines', 'foodId': 'food_bysjialanqk7toayuhcdkbu46vks'}, {'text': '1 dash orange bitters (optional, but highly recommended)', 'quantity': 1.0, 'measure': 'dash', 'food': 'orange bitters', 'weight': 0.5791666664610352, 'foodCategory': 'liquors and cocktails', 'foodId': 'food_a8hiis0bu96cjga87113lbdwhr46'}]"

In [40]:
#Looking at the title for the 100th recipes in df_recs:
df_recs['title'].iloc[100]

'The Best Chicken Parmesan Recipe'

In [41]:
#Looking at the list_ingredients_lines for the 100th recipe in df_recs:
df_recs['list_ingredients_lines'].iloc[100]

"['3 boneless skinless chicken breast halves', '1 3/4 cups buttermilk, divided', '2 medium cloves garlic, minced (about 2 teaspoons)', 'Kosher salt and freshly ground black pepper', '1/2 loaf crusty italian bread, crust removed, sliced into 1/2-inch slices', '5 ounces grated Parmesan cheese, plus more for serving', '1 1/2 cups all-purpose flour', '2 large eggs', '1/2 cup vegetable or canola oil', '1 quart Italian-American red sauce (see note above)', '10 ounces fresh mozzarella cheese, cut into 1/2-inch chunks', '2 tablespoons minced fresh parsley, basil, or a mix']"

In [42]:
#Looking at the list_ingredients for the 100th recipe in df_recs:
df_recs['list_ingredients'].iloc[100]

"[{'text': '3 boneless skinless chicken breast halves', 'quantity': 3.0, 'measure': 'half', 'food': 'boneless skinless chicken breast', 'weight': 261.0, 'foodCategory': 'Poultry', 'foodId': 'food_bwbli78brehlfjaqv25j3abzlo6b'}, {'text': '1 3/4 cups buttermilk, divided', 'quantity': 1.75, 'measure': 'cup', 'food': 'buttermilk', 'weight': 428.75, 'foodCategory': 'Milk', 'foodId': 'food_aq58hokb0jwwsja45z2a5bix3gm0'}, {'text': '2 medium cloves garlic, minced (about 2 teaspoons)', 'quantity': 2.0, 'measure': 'teaspoon', 'food': 'garlic', 'weight': 5.6, 'foodCategory': 'vegetables', 'foodId': 'food_b5bagehah9jusvamzk4tfapqk0tm'}, {'text': 'Kosher salt and freshly ground black pepper', 'quantity': 0.0, 'measure': None, 'food': 'Kosher salt', 'weight': 10.09435708125, 'foodCategory': 'Condiments and sauces', 'foodId': 'food_bhb8s26b4myb4wbb3yvlebmtp7ia'}, {'text': 'Kosher salt and freshly ground black pepper', 'quantity': 0.0, 'measure': None, 'food': 'black pepper', 'weight': 5.047178540625,

In [43]:
#Looking at the number of ingredients in the 100th recipe in df_recs:
len(eval(df_recs['list_ingredients'].iloc[100]))

12

In [44]:
#Printing the food value for each ingredient in the 100th recipes in df_recs:
for i in range(len(eval(df_recs['list_ingredients'].iloc[100]))):
    print(eval(df_recs['list_ingredients'].iloc[100])[i]['food'])

boneless skinless chicken breast
buttermilk
garlic
Kosher salt
black pepper
italian bread
Parmesan cheese
all-purpose flour
eggs
canola oil
mozzarella cheese
basil


In [45]:
#Printing the food category value for each ingredient in the 100th recipes in df_recs:
for i in range(len(eval(df_recs['list_ingredients'].iloc[100]))):
    print(eval(df_recs['list_ingredients'].iloc[100])[i]['foodCategory'])

Poultry
Milk
vegetables
Condiments and sauces
Condiments and sauces
bread, rolls and tortillas
Cheese
grains
Eggs
Oils
Cheese
Condiments and sauces


The list_ingredients column has a good amount of information, and the food and foodCategory information is helpful to identify which ingredients are in the recipe.  This information will be taken out of each recipe's list_ingredients dictionary and will be placed in new columns called ingredients and ingredient_categories.

In [46]:
eval(df_recs['list_ingredients'].iloc[0])[0]['foodCategory']

'liquors and cocktails'

In [47]:
eval(df_recs['list_ingredients'].iloc[1])

[{'text': '2 cups "00" flour, plus more for kneading',
  'quantity': 2.0,
  'measure': 'cup',
  'food': 'flour',
  'weight': 250.0,
  'foodCategory': 'grains',
  'foodId': 'food_b0qr35wbg1qs8xbotedt2aux346r'},
 {'text': '20 large egg yolks',
  'quantity': 20.0,
  'measure': '<unit>',
  'food': 'egg yolks',
  'weight': 340.0,
  'foodCategory': 'Eggs',
  'foodId': 'food_a30uym7bg0m0webtbqbhgb06ujca'},
 {'text': '1 tablespoon olive oil',
  'quantity': 1.0,
  'measure': 'tablespoon',
  'food': 'olive oil',
  'weight': 13.5,
  'foodCategory': 'Oils',
  'foodId': 'food_ak2pwnsaa1e2ikb394af7b011fz5'}]

In [48]:
#Instantiating lists that will be filled below:
ingredients_i = []
ingredient_categories_i = []
ingredients_list = []
ingredient_categories_list = []

#Looping through each recipe in df_recs:
for i in range(len(df_recs['list_ingredients'])):
    #Creating recipe i's ingredints_i list: 
    ingredients_i = [ingredient_dict['food'] for ingredient_dict in eval(df_recs['list_ingredients'].iloc[i])]  
    #Creating recipe_i's ingredient_categories_i list:
    ingredient_categories_i = [ingredient_dict['foodCategory'] for ingredient_dict in eval(df_recs['list_ingredients'].iloc[i])]  
    
    #Creating a list of ingredients lists and a list of ingredient categories lists which will be added
    #as columns to the df_recs dataframe    
    ingredients_list.append(ingredients_i)
    ingredient_categories_list.append(ingredient_categories_i)

In [49]:
#Checking there are enough lists of ingredients lists to add ingredients_list to the df_recs dataframe: 
len(ingredients_list)

22941

In [50]:
#Checking there are enough lists of ingredient categories lists to add ingredient_categories_list to 
#the df_recs dataframe:
len(ingredient_categories_list)

22941

In [51]:
ingredients_list[0:5]

[['gin', 'dry vermouth', 'orange bitters'],
 ['flour', 'egg yolks', 'olive oil'],
 ['sweet vermouth', 'gin', 'campari'],
 ['eggs', 'flour', 'salt'],
 ['all-purpose flour', 'semolina flour', 'salt', 'eggs', 'egg yolks']]

In [52]:
ingredient_categories_list[0:5]

[['liquors and cocktails', 'wines', 'liquors and cocktails'],
 ['grains', 'Eggs', 'Oils'],
 ['wines', 'liquors and cocktails', 'liquors and cocktails'],
 ['Eggs', 'grains', 'Condiments and sauces'],
 ['grains', 'grains', 'Condiments and sauces', 'Eggs', 'Eggs']]

In [53]:
#Creating an ingredients column in df_recs and filling it with list_ingredients:
df_recs['ingredients'] = ingredients_list

In [54]:
#Creating an ingredient_categories column in df_recs and filling it with list_ingredient_categories:
df_recs['ingredient_categories'] = ingredient_categories_list

In [55]:
#Checking work:
df_recs.head()

Unnamed: 0,uri,title,image,source,url,share_as,yield,diet_labels,health_labels,cautions,list_ingredients_lines,list_ingredients,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,monounsat_fat_pct,polyunsat_fat_g,polyunsat_fat_pct,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,trans_fat_pct,iron_mg,...,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,sugar_pct,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories
0,http://www.edamam.com/ontologies/edamam.owl#re...,Martini Recipe,https://www.edamam.com/web-img/b06/b06df6af92e...,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,http://www.edamam.com/recipe/martini-recipe-9a...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 ounces dry gin', '1 ounce dry vermouth', '...","[{'text': '2 ounces dry gin', 'quantity': 2.0,...",174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,,,174.357137,8.717857,,,,,,,,,,,0.105125,...,,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,,0.001452,0.121019,,,0.015315,1.178041,,,,,,,,,,,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,http://www.edamam.com/ontologies/edamam.owl#re...,Pasta Dough,https://www.edamam.com/web-img/bf8/bf8db947c7e...,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,http://www.edamam.com/recipe/pasta-dough-585c8...,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],"['2 cups ""00"" flour, plus more for kneading', ...","[{'text': '2 cups ""00"" flour, plus more for kn...",2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,,16.746705,,34.72498,173.6249,106.186,163.363077,,,12.2826,...,27.0,561.4,140.35,638.235,13.579468,72.0,17.142857,168.47,7.019583,3.2066,20.04125,1596.0,228.0,79.749,159.498,1.8952,145.784615,2.579,,0.8984,74.866667,1295.4,143.933333,1.3,100.0,6.63,276.25,,,18.36,122.4,10.85925,72.395,11.257,9.380833,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,http://www.edamam.com/ontologies/edamam.owl#re...,Classic Negroni Cocktail Recipe,https://www.edamam.com/web-img/c19/c19d37fdaf7...,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,http://www.edamam.com/recipe/classic-negroni-c...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['1 oz sweet vermouth', '1 oz gin', '1 oz camp...","[{'text': '1 oz sweet vermouth', 'quantity': 1...",163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,,,163.576748,8.178837,,,,,,,,,,,0.116233,...,,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,,0.003118,0.259871,,,0.015592,1.199403,,,,,,,,,,,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,http://www.edamam.com/ontologies/edamam.owl#re...,Simple Fresh Pasta,https://www.edamam.com/web-img/947/9476a940066...,Food52,http://food52.com/recipes/27825-simple-fresh-p...,http://www.edamam.com/recipe/simple-fresh-past...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['3 large eggs', '300 grams plain flour', '1 t...","[{'text': '3 large eggs', 'quantity': 3.0, 'me...",1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,,4.1055,,5.154,25.77,17.205,26.469231,0.057,,6.142045,...,32.4,148.5,37.125,528.170796,11.237677,84.02135,20.005083,1046.466,43.60275,3.8625,24.140625,621.0,88.714286,49.83,99.66,0.8055,61.961538,1.365,,0.42,35.0,240.0,26.666667,0.387,29.769231,1.335,55.625,,,3.0,20.0,1.755,11.7,1.35,1.125,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,http://www.edamam.com/ontologies/edamam.owl#re...,Egg Noodle,https://www.edamam.com/web-img/de3/de3ac901e9b...,Epicurious,http://www.epicurious.com/recipes/food/views/E...,http://www.edamam.com/recipe/egg-noodle-59c705...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],"['2 cups all-purpose flour', '1 cup semolina f...","[{'text': '2 cups all-purpose flour', 'quantit...",1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,,5.658615,,8.471165,42.355827,26.678313,41.043559,0.03268,,16.639369,...,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,34.888751,1294.982296,53.957596,20.365128,127.28205,877.48,125.354286,66.345453,132.690907,2.046036,157.387385,1.294853,,2.559305,213.275444,342.832,38.092444,0.616743,47.441795,1.8158,75.658333,,,4.6288,30.858667,2.86776,19.1184,1.385067,1.154222,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


Spot checking whether the new ingredients and ingredient_categories columns contain most of the information in list_ingredients_lines and list_ingredients:

In [56]:
df_recs['title'].iloc[400]

'Eggplant Caponata'

In [57]:
df_recs['list_ingredients_lines'].iloc[400]

"['3 cups olive oil', '2 lbs. eggplant, cut into 1″ cubes', '1 large yellow onion, chopped', '1 rib celery, roughly chopped', 'Kosher salt and freshly ground black pepper, to taste', '3 tbsp. tomato paste, thinned with 1/4 cup water', '1 cup crushed canned tomatoes', '6 oz. green olives, pitted and roughly chopped', '1/2 cup white wine vinegar', '1/2 cup golden raisins', '1/4 cup salt-packed capers, rinsed and drained', '3 tbsp. sugar', '2 tbsp. finely grated unsweetened chocolate', '1/2 cup finely shredded basil', '2 tbsp. pine nuts']"

In [58]:
df_recs['list_ingredients'].iloc[400]

"[{'text': '3 cups olive oil', 'quantity': 3.0, 'measure': 'cup', 'food': 'olive oil', 'weight': 648.0, 'foodCategory': 'Oils', 'foodId': 'food_ak2pwnsaa1e2ikb394af7b011fz5'}, {'text': '2 lbs. eggplant, cut into 1″ cubes', 'quantity': 2.0, 'measure': 'pound', 'food': 'eggplant', 'weight': 907.18474, 'foodCategory': 'vegetables', 'foodId': 'food_bqqnufbaoa5xcua8zk8arat7q7kp'}, {'text': '1 large yellow onion, chopped', 'quantity': 1.0, 'measure': '<unit>', 'food': 'yellow onion', 'weight': 150.0, 'foodCategory': 'vegetables', 'foodId': 'food_asxxac2a3nlvwza49mkzdbayqird'}, {'text': '1 rib celery, roughly chopped', 'quantity': 1.0, 'measure': 'rib', 'food': 'celery', 'weight': 40.0, 'foodCategory': 'vegetables', 'foodId': 'food_bfz264baccwcmhbhs0dllba76s5d'}, {'text': 'Kosher salt and freshly ground black pepper, to taste', 'quantity': 0.0, 'measure': None, 'food': 'Kosher salt', 'weight': 15.03769127249833, 'foodCategory': 'Condiments and sauces', 'foodId': 'food_bhb8s26b4myb4wbb3yvlebmt

In [59]:
df_recs['ingredients'].iloc[400]

['olive oil',
 'eggplant',
 'yellow onion',
 'celery',
 'Kosher salt',
 'black pepper',
 'tomato paste',
 'canned tomatoes',
 'green olives',
 'white wine vinegar',
 'golden raisins',
 'capers',
 'sugar',
 'unsweetened chocolate',
 'basil',
 'pine nuts']

In [60]:
df_recs['ingredient_categories'].iloc[400]

['Oils',
 'vegetables',
 'vegetables',
 'vegetables',
 'Condiments and sauces',
 'Condiments and sauces',
 'canned vegetables',
 'canned vegetables',
 'canned fruit',
 'Condiments and sauces',
 'fruit',
 'Condiments and sauces',
 'sugars',
 'chocolate',
 'Condiments and sauces',
 'plant-based protein']

It appears that the ingredients and ingredient_categories columns contain most of the information I would like to use from the list_ingredients_lines and list_ingredients columns, so the list_ingredients_lines and list_ingredients columns will be dropped from df_recs soon.

In [61]:
#Saving df_recs before columns will be dropped:
df_recs.to_csv('./data/df_recs_with_nulls_final.csv')

#### Dropping Columns from df_recs:

In [62]:
#Looking at info about df_recs:
df_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22941 entries, 0 to 22940
Data columns (total 76 columns):
uri                       22941 non-null object
title                     22941 non-null object
image                     22941 non-null object
source                    22941 non-null object
url                       22941 non-null object
share_as                  22941 non-null object
yield                     22941 non-null float64
diet_labels               22941 non-null object
health_labels             22941 non-null object
cautions                  22941 non-null object
list_ingredients_lines    22941 non-null object
list_ingredients          22941 non-null object
calories                  22941 non-null float64
total_weight              22941 non-null float64
total_time                22941 non-null float64
cuisine_type              22941 non-null object
meal_type                 22941 non-null object
dish_type                 22941 non-null object
calcium_mg             

In [63]:
#Finding the number of null values in each column of df_recs:
df_recs.isnull().sum()

uri                           0
title                         0
image                         0
source                        0
url                           0
share_as                      0
yield                         0
diet_labels                   0
health_labels                 0
cautions                      0
list_ingredients_lines        0
list_ingredients              0
calories                      0
total_weight                  0
total_time                    0
cuisine_type                  0
meal_type                     0
dish_type                     0
calcium_mg                    0
calcium_pct                   0
carbs_g                      10
carbs_pct                    10
cholesterol_mg             2765
cholesterol_pct            2765
energy_kcal                   0
energy_pct                    0
monounsat_fat_g              51
monounsat_fat_pct         22941
polyunsat_fat_g              51
polyunsat_fat_pct         22941
                          ...  
sodium_m

The following columns are empty and will therefore be removed from df_recs: monounsat_fat_pct, polyunsat_fat_pct, trans_fat_pct, and sugar_pct.  

This makes sense because I found during research for data acquisition that the daily recommended value for unsaturated fat did not include recommended values for monounsaturated fat on its own and polyunsaturated fat on its own.  According to _The Washington Post_, trans fat was officially banned from U.S. restaurants and grocery stores in 2018.  It makes sense that there is no daily recommended value to generate a percent daily value for trans fats since they are officially banned in U.S. restaurants and grocery stores.  

It also makes sense that there are no percent of daily value data for total sugars because there is only a recommended daily value for added sugars available currently.  This is because natural sugars in fruits and vegetables are paired with other nutrients like fiber which can help to reduce blood sugar spikes (as long as the fruit is not dried and therefore concentrated with sugars), so organizations like the World Health Organization and the FDA are more focused on setting maximum recommended values of added sugars rather than natural sugars (and thus total sugars which include a sum of natural and added sugars).

Sources:  

Fats:
- https://medlineplus.gov/ency/patientinstructions/000785.htm
- Caitlin Dewey, https://www.washingtonpost.com/news/wonk/wp/2018/06/18/artificial-trans-fats-widely-linked-to-heart-disease-are-officially-banned/?utm_term=.1655f2600adf  

Sugars:
- https://www.accessdata.fda.gov/scripts/InteractiveNutritionFactsLabel/sugars.html    
- http://sugarscience.ucsf.edu/sugar-faq.html#.XNdZFutKhn4  
- https://universityhealthnews.com/daily/nutrition/high-sugar-content-fruit-damaging-health-waistline/  
- https://www.huffingtonpost.com.au/2017/09/14/how-much-natural-sugar-should-we-eat-a-day_a_23208377/  
- https://www.medicalnewstoday.com/articles/324673.php

In [64]:
#Dropping the following columns: monounsat_fat_pct, polyunsat_fat_pct, trans_fat_pct, sugar_pct:
df_recs.drop(labels = ['monounsat_fat_pct', 'polyunsat_fat_pct', 'trans_fat_pct', 'sugar_pct'],
            axis = 1,
            inplace = True)

In [65]:
#Checking dropped 4 columns, so should have 76 - 4 = 72 columns remaining:
df_recs.shape

(22941, 72)

In [66]:
#Looking at how much memory is used by df_recs:
df_recs.memory_usage().sum()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.memory_usage.html

13214096

After removing four columns filled with null values, df_recs uses approx. 25 MB of memory.

In [67]:
#Looking at url's for recipes and how often they are repeated in df_recs:
#df_recs['url'].value_counts() #this line of code was supressed to reduce length of this notebook

Dropping the uri, image, and share_as columns since they won't be used.  The url provides most of the unique identifying information for each recipe.

In [68]:
#Dropping the following columns: uri, image, and share_as:
df_recs.drop(labels = ['uri', 'image', 'share_as'],
             axis = 1,
             inplace = True)

In [69]:
#Checking dropped 3 columns, so should have 72 - 3 = 69 columns remaining:
df_recs.shape

(22941, 69)

In [70]:
#Looking at how much memory is used by df_recs:
df_recs.memory_usage().sum()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.memory_usage.html

12663512

The dataframe df_recs now uses approx. 24 MB of memory rather than approx. 25 MB of memory.

Removing list_ingredient_lines and list_ingredients since the information we need was pulled from list_ingredients, and these columns are no longer needed.

**Dropping the list_ingredients_lines and list_ingredients columns**

In [71]:
#Dropping the following columns: list_ingredients_lines and list_ingredients:
df_recs.drop(labels = ['list_ingredients_lines', 'list_ingredients'],
             axis = 1,
             inplace = True)

In [72]:
#Checking dropped 2 columns, so should have 69 - 2 = 67 columns remaining:
df_recs.shape

(22941, 67)

In [73]:
#Looking at how much memory is used by df_recs:
df_recs.memory_usage().sum()

12296456

The dataframe df_recs now uses approx. 23 MB of memory rather than approx. 24 MB of memory.

In [74]:
#Looking at info about df_recs:
df_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22941 entries, 0 to 22940
Data columns (total 67 columns):
title                    22941 non-null object
source                   22941 non-null object
url                      22941 non-null object
yield                    22941 non-null float64
diet_labels              22941 non-null object
health_labels            22941 non-null object
cautions                 22941 non-null object
calories                 22941 non-null float64
total_weight             22941 non-null float64
total_time               22941 non-null float64
cuisine_type             22941 non-null object
meal_type                22941 non-null object
dish_type                22941 non-null object
calcium_mg               22941 non-null float64
calcium_pct              22941 non-null float64
carbs_g                  22931 non-null float64
carbs_pct                22931 non-null float64
cholesterol_mg           20176 non-null float64
cholesterol_pct          20176 non-n

In [75]:
#Looking at null values in columns of df_recs:
df_recs.isnull().sum()

title                       0
source                      0
url                         0
yield                       0
diet_labels                 0
health_labels               0
cautions                    0
calories                    0
total_weight                0
total_time                  0
cuisine_type                0
meal_type                   0
dish_type                   0
calcium_mg                  0
calcium_pct                 0
carbs_g                    10
carbs_pct                  10
cholesterol_mg           2765
cholesterol_pct          2765
energy_kcal                 0
energy_pct                  0
monounsat_fat_g            51
polyunsat_fat_g            51
sat_fat_g                  48
sat_fat_pct                48
fat_g                      29
fat_pct                    29
trans_fat_g              4928
iron_mg                     0
iron_pct                    0
                         ... 
magnesium_pct               7
sodium_mg                   0
sodium_pct

**Inspecting object type column null values**

In [76]:
#Looking at first row value for df_recs['meal_type']:
df_recs['meal_type'][0]

"['cocktail']"

In [77]:
#Looking at head of df_recs['meal_type']:
df_recs['meal_type'].head()

0             ['cocktail']
1                ['pasta']
2    ['cocktail', 'drink']
3                ['pasta']
4               ['noodle']
Name: meal_type, dtype: object

In [78]:
#Looking at rows with null values for meal_type:
df_recs[df_recs['meal_type'].isnull()].head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories


In [79]:
#Filling null meal_type values with "['NA']" (using same format as the other values in meal_type column
#for ease of future evaluation of the values):
df_recs['meal_type'].fillna("['NA']", inplace = True)

In [80]:
#Looking at null values in columns of df_recs:
df_recs.isnull().sum()

title                       0
source                      0
url                         0
yield                       0
diet_labels                 0
health_labels               0
cautions                    0
calories                    0
total_weight                0
total_time                  0
cuisine_type                0
meal_type                   0
dish_type                   0
calcium_mg                  0
calcium_pct                 0
carbs_g                    10
carbs_pct                  10
cholesterol_mg           2765
cholesterol_pct          2765
energy_kcal                 0
energy_pct                  0
monounsat_fat_g            51
polyunsat_fat_g            51
sat_fat_g                  48
sat_fat_pct                48
fat_g                      29
fat_pct                    29
trans_fat_g              4928
iron_mg                     0
iron_pct                    0
                         ... 
magnesium_pct               7
sodium_mg                   0
sodium_pct

In [81]:
#Looking at info about df_recs:
df_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22941 entries, 0 to 22940
Data columns (total 67 columns):
title                    22941 non-null object
source                   22941 non-null object
url                      22941 non-null object
yield                    22941 non-null float64
diet_labels              22941 non-null object
health_labels            22941 non-null object
cautions                 22941 non-null object
calories                 22941 non-null float64
total_weight             22941 non-null float64
total_time               22941 non-null float64
cuisine_type             22941 non-null object
meal_type                22941 non-null object
dish_type                22941 non-null object
calcium_mg               22941 non-null float64
calcium_pct              22941 non-null float64
carbs_g                  22931 non-null float64
carbs_pct                22931 non-null float64
cholesterol_mg           20176 non-null float64
cholesterol_pct          20176 non-n

Now all object type columns have no null values.  It looks like there are a lot of null values remaining in numerical columns.

In [85]:
df_recs[df_recs['carbs_g'].isnull()].head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories
1142,Baked Chicken Breast Recipe,Taste of Home,http://www.tasteofhome.com/Recipes/Baked-Chick...,4.0,['Low-Carb'],"['Sugar-Conscious', 'Low Sugar', 'Keto-Friendl...",[],1600.137753,1143.837208,45.0,['American'],['roast'],['dinner'],101.075829,10.107583,,,580.598234,193.532745,1600.137753,80.006888,37.501877,19.047211,24.462539,122.312695,88.414588,136.022444,0.970319,6.730843,37.393571,,,36.28739,9.071847,1996.234931,42.473084,226.849748,54.011845,2647.514621,110.313109,89.883864,561.77415,1578.501448,225.500207,189.148018,378.296037,0.771107,59.315925,,0.571526,47.627199,217.724338,24.191593,4.808079,369.85224,3.084428,128.517838,,,3.628739,24.191593,3.235099,21.567325,3.2085,2.67375,"[chicken breast, salt, canola oil]","[Poultry, Condiments and sauces, Oils]"
3316,Grilled Steak,Epicurious,http://www.epicurious.com/recipes/food/views/G...,8.0,['Low-Carb'],"['Sugar-Conscious', 'Low Sugar', 'Keto-Friendly']",[],8408.64,3688.0,52.0,['American'],['barbecue'],['dinner'],774.48,77.448,,,2286.56,762.186667,8408.64,420.432,269.66656,26.5536,246.50592,1232.5296,583.8104,898.169846,33.5608,60.4832,336.017778,,,184.4,46.1,8445.52,179.691915,368.8,87.809524,1290.8,53.783333,190.44832,1190.302,6417.12,916.731429,737.6,1475.2,7.1916,553.2,,1.73336,144.446667,147.52,16.391111,20.94784,1611.372308,61.2208,2550.866667,,,3.688,24.586667,9.9576,66.384,59.008,49.173333,[t-bone steaks],[meats]
11519,Turducken,Delish,http://www.delish.com/cooking/recipe-ideas/rec...,8.0,"['High-Protein', 'Low-Carb']","['Sugar-Conscious', 'Low Sugar', 'Keto-Friendl...",[],4099.43,2540.0,0.0,['American'],"['roast', 'side salad']",['dinner'],302.06,30.206,,,1663.33,554.443333,4099.43,204.9715,82.93477,41.86781,50.25736,251.2868,196.5153,302.331231,,38.5491,214.161667,,,220.73,55.1825,6893.59,146.672128,598.14,142.414286,1478.23,61.592917,126.30356,789.39725,4674.18,667.74,545.0161,1090.0322,3.3755,259.653846,,2.34896,195.746667,85.12,9.457778,12.4359,956.607692,11.4012,475.05,15.438,17.153333,,,3.8745,25.83,16.254,13.545,"[turkey breast, duck breast, olive oil]","[Poultry, Poultry, Oils]"
12366,Crock Pot Ribs recipes,Epicurious,http://www.epicurious.com/recipes/food/views/c...,3.0,['Low-Carb'],"['Sugar-Conscious', 'Low Sugar', 'Keto-Friendl...",[],9570.96,5070.0,480.0,['American'],"['barbecue', 'side dish']",['dinner'],1115.52,111.552,,,3747.36,1249.12,9570.96,478.548,139.86768,48.91824,120.0168,600.084,598.5648,920.868923,3.74736,43.0638,239.243333,,,,,16104.0,342.638298,1063.5,253.214286,5515.8,229.825,156.07248,975.453,9773.52,1396.217143,979.3776,1958.7552,12.81192,985.532308,,18.99,1582.5,101.28,11.253333,27.29496,2099.612308,50.64,2110.0,,,35.448,236.32,11.6472,77.648,,,"[country style pork ribs, salt]","[meats, Condiments and sauces]"
12402,Prime Rib Recipe,Group Recipes,http://www.grouprecipes.com/5466/prime-rib.html,6.0,[],"['Dairy-Free', 'Gluten-Free', 'Wheat-Free', 'E...",[],6952.4,2686.111444,120.0,['American'],['roast'],['dinner'],270.306747,27.030675,,,1871.8,623.933333,6952.4,347.62,265.50146,24.62754,247.7461,1238.7305,560.4704,862.262154,35.40376,46.300168,257.223154,,,80.22,20.055,6177.908916,131.444871,481.441114,114.628837,6218.33352,259.09723,111.05122,694.070125,3502.94,500.42,479.1808,958.3616,6.52456,501.889231,,2.24616,187.18,133.7,14.855556,10.61578,816.598462,46.5276,1938.65,,,5.348,35.653333,4.011,26.74,40.11,33.425,"[bone-in rib-eye, salt, garlic powder]","[meats, Condiments and sauces, Condiments and ..."


In [86]:
df_recs[df_recs['cholesterol_mg'].isnull()].head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,,,174.357137,8.717857,,,,,,,,0.105125,0.584027,,,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,0.001452,0.121019,,,0.015315,1.178041,,,,,,,,,,,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,,,163.576748,8.178837,,,,,,,,0.116233,0.645739,,,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,0.003118,0.259871,,,0.015592,1.199403,,,,,,,,,,,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
5,Pisco Sour Recipe,Serious Eats,http://www.seriouseats.com/recipes/2011/02/tim...,1.0,"['Low-Fat', 'Low-Sodium']","['Low Potassium', 'Kidney-Friendly', 'Vegetari...",['Sulfites'],276.782739,162.060235,2.0,['South American'],['cocktail'],['dinner'],8.581012,0.858101,18.130881,6.043627,,,276.782739,13.839137,0.002268,0.00652,0.002268,0.01134,0.064725,0.099576,,0.846491,4.702728,0.113398,0.453592,3.890952,0.972738,91.317063,1.942916,7.298176,1.737661,57.583519,2.399313,0.100425,0.627655,13.071847,1.867407,2.996668,5.993336,0.136348,10.488281,16.230435,0.040947,3.412257,0.56699,0.062999,0.012953,0.996408,0.02376,0.99,8.504857,9.449841,,,0.062369,0.415793,0.170097,0.141748,"[pisco, lime juice, simple syrup, egg white, b...","[liquors and cocktails, fruit, sugars, Eggs, l..."
7,Cauliflower Popcorn,Food52,https://food52.com/recipes/10620-cauliflower-p...,4.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Keto-Friendly', 'Vegan', ...",[],137.431444,539.023611,39.0,['South American'],['side dish'],['dinner'],118.820111,11.882011,26.921108,8.973703,,,137.431444,6.871572,0.3331,0.234627,0.717786,3.588928,1.748212,2.689557,0.00011,2.36885,13.160278,10.80001,43.200039,306.5725,76.643125,1612.088889,34.299764,81.0815,19.305119,452.248208,18.843675,2.729177,17.057357,237.207764,33.886823,10.34507,20.69014,0.32296,24.843109,10.277842,0.269002,22.416873,,,0.989721,76.132358,,,259.21026,288.0114,,,0.438898,2.925984,83.381797,69.484831,"[cauliflower, pam, salt, turmeric]","[vegetables, Oils, Condiments and sauces, Cond..."
8,Manhattan,Saveur,http://www.saveur.com/article/Recipes/Manhatta...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],251.331528,121.398093,0.0,['American'],"['cocktail', 'drink']",['dinner'],4.967962,0.496796,2.954656,0.984885,,,251.331528,12.566576,0.00245,0.00275,0.00195,0.00975,0.0105,0.016154,,0.144603,0.80335,0.16,0.64,0.283495,0.070874,30.026514,0.638862,3.318448,0.790107,1.647476,0.068645,0.090174,0.563591,8.491362,1.213052,0.030845,0.061689,0.007491,0.576221,2.24751,0.008401,0.700113,0.1,0.011111,0.015589,1.199134,,,,,,,0.0025,0.016667,0.075,0.0625,"[rye whiskey, sweet vermouth, Angostura bitter...","[liquors and cocktails, wines, liquors and coc..."


It looks like recipes with null values for certain nutrients probably don't contain those nutrients, so null values for numerical columns will be filled with 0 values.

In [87]:
df_recs.fillna(0, inplace = True)

In [88]:
df_recs.isnull().sum().sum()

0

Now there are no more null values.

In [89]:
#Saving df_recs in this state (before adding more numerical columns):
df_recs.to_csv('./data/df_recs_without_nulls_final.csv')

#### Creating proportion of daily value columns:

First, the percentages will be divided by 100 to get proportions (currently the percent of daily values columns have the nutritional content of the dish divided by the total daily value recommended for the nutrient which is then multiplied by 100%.  Here we are dividing by 100% to get to the proportion of daily value.).

In [90]:
df_recs.head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,thiamin_mg,thiamin_pct,vit_A_mcg,vit_A_pct,vit_B6_mg,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,174.357137,8.717857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105125,0.584027,0.0,0.0,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,0.001452,0.121019,0.0,0.0,0.015315,1.178041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,16.746705,34.72498,173.6249,106.186,163.363077,0.0,12.2826,68.236667,6.75,27.0,561.4,140.35,638.235,13.579468,72.0,17.142857,168.47,7.019583,3.2066,20.04125,1596.0,228.0,79.749,159.498,1.8952,145.784615,2.579,0.8984,74.866667,1295.4,143.933333,1.3,100.0,6.63,276.25,0.0,0.0,18.36,122.4,10.85925,72.395,11.257,9.380833,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,163.576748,8.178837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116233,0.645739,0.0,0.0,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,0.003118,0.259871,0.0,0.0,0.015592,1.199403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,4.1055,5.154,25.77,17.205,26.469231,0.057,6.142045,34.122474,8.1,32.4,148.5,37.125,528.170796,11.237677,84.02135,20.005083,1046.466,43.60275,3.8625,24.140625,621.0,88.714286,49.83,99.66,0.8055,61.961538,1.365,0.42,35.0,240.0,26.666667,0.387,29.769231,1.335,55.625,0.0,0.0,3.0,20.0,1.755,11.7,1.35,1.125,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,5.658615,8.471165,42.355827,26.678313,41.043559,0.03268,16.639369,92.44094,13.263,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,34.888751,1294.982296,53.957596,20.365128,127.28205,877.48,125.354286,66.345453,132.690907,2.046036,157.387385,1.294853,2.559305,213.275444,342.832,38.092444,0.616743,47.441795,1.8158,75.658333,0.0,0.0,4.6288,30.858667,2.86776,19.1184,1.385067,1.154222,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [91]:
df_recs.columns

Index(['title', 'source', 'url', 'yield', 'diet_labels', 'health_labels',
       'cautions', 'calories', 'total_weight', 'total_time', 'cuisine_type',
       'meal_type', 'dish_type', 'calcium_mg', 'calcium_pct', 'carbs_g',
       'carbs_pct', 'cholesterol_mg', 'cholesterol_pct', 'energy_kcal',
       'energy_pct', 'monounsat_fat_g', 'polyunsat_fat_g', 'sat_fat_g',
       'sat_fat_pct', 'fat_g', 'fat_pct', 'trans_fat_g', 'iron_mg', 'iron_pct',
       'fiber_g', 'fiber_pct', 'folate_mcg', 'folate_pct', 'potassium_mg',
       'potassium_pct', 'magnesium_mg', 'magnesium_pct', 'sodium_mg',
       'sodium_pct', 'niacin_mg', 'niacin_pct', 'phosphorus_mg',
       'phosphorus_pct', 'protein_g', 'protein_pct', 'riboflavin_mg',
       'riboflavin_pct', 'sugar_g', 'thiamin_mg', 'thiamin_pct', 'vit_A_mcg',
       'vit_A_pct', 'vit_B6_mg', 'vit_B6_pct', 'vit_B12_mcg', 'vit_B12_pct',
       'vit_C_mg', 'vit_C_pct', 'vit_D_mcg', 'vit_D_pct', 'vit_E_mg',
       'vit_E_pct', 'vit_K_mcg', 'vit_K_pct', '

In [92]:
for column in df_recs.columns:
    if 'pct' in column:
        df_recs[f'{column}_div100'] = df_recs[column]/100

In [93]:
df_recs.head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,...,vit_B6_pct,vit_B12_mcg,vit_B12_pct,vit_C_mg,vit_C_pct,vit_D_mcg,vit_D_pct,vit_E_mg,vit_E_pct,vit_K_mcg,vit_K_pct,ingredients,ingredient_categories,calcium_pct_div100,carbs_pct_div100,cholesterol_pct_div100,energy_pct_div100,sat_fat_pct_div100,fat_pct_div100,iron_pct_div100,fiber_pct_div100,folate_pct_div100,potassium_pct_div100,magnesium_pct_div100,sodium_pct_div100,niacin_pct_div100,phosphorus_pct_div100,protein_pct_div100,riboflavin_pct_div100,thiamin_pct_div100,vit_A_pct_div100,vit_B6_pct_div100,vit_B12_pct_div100,vit_C_pct_div100,vit_D_pct_div100,vit_E_pct_div100,vit_K_pct_div100
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,174.357137,8.717857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105125,0.584027,0.0,0.0,0.283495,0.070874,28.077611,0.597396,3.118448,...,1.178041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",0.002268,0.00257,0.0,0.087179,0.0,0.0,0.00584,0.0,0.000709,0.005974,0.007425,0.001066,0.002946,0.008133,0.000397,0.005034,0.00121,0.0,0.01178,0.0,0.0,0.0,0.0,0.0
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,16.746705,34.72498,173.6249,106.186,163.363077,0.0,12.2826,68.236667,6.75,27.0,561.4,140.35,638.235,13.579468,72.0,...,100.0,6.63,276.25,0.0,0.0,18.36,122.4,10.85925,72.395,11.257,9.380833,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",0.476235,0.676603,12.296667,1.06207,1.736249,1.633631,0.682367,0.27,1.4035,0.135795,0.171429,0.070196,0.200413,2.28,1.59498,1.457846,0.748667,1.439333,1.0,2.7625,0.0,1.224,0.72395,0.093808
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,163.576748,8.178837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116233,0.645739,0.0,0.0,0.283495,0.070874,28.633018,0.609213,3.118448,...,1.199403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",0.002268,0.00257,0.0,0.081788,0.0,0.0,0.006457,0.0,0.000709,0.006092,0.007425,0.000945,0.003172,0.00972,0.000397,0.005888,0.002599,0.0,0.011994,0.0,0.0,0.0,0.0,0.0
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,4.1055,5.154,25.77,17.205,26.469231,0.057,6.142045,34.122474,8.1,32.4,148.5,37.125,528.170796,11.237677,84.02135,...,29.769231,1.335,55.625,0.0,0.0,3.0,20.0,1.755,11.7,1.35,1.125,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",0.129512,0.7667,1.86,0.65325,0.2577,0.264692,0.341225,0.324,0.37125,0.112377,0.200051,0.436027,0.241406,0.887143,0.9966,0.619615,0.35,0.266667,0.297692,0.55625,0.0,0.2,0.117,0.01125
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,5.658615,8.471165,42.355827,26.678313,41.043559,0.03268,16.639369,92.44094,13.263,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,...,47.441795,1.8158,75.658333,0.0,0.0,4.6288,30.858667,2.86776,19.1184,1.385067,1.154222,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",0.184244,1.049847,3.014578,0.903815,0.423558,0.410436,0.924409,0.53052,2.417013,0.160798,0.348888,0.539576,1.27282,1.253543,1.326909,1.573874,2.132754,0.380924,0.474418,0.756583,0.0,0.308587,0.191184,0.011542


A few proportions of daily value were compared with their corresponding percent of daily value.  The new columns appear to be correctly calculated.

In [94]:
#Saving df_recs in this state (before adding more numerical columns):
df_recs.to_csv('./data/df_recs_without_nulls_with_proportionsDV_final.csv')

#### Changing nutritional info to be per serving in the dish rather than per full dish

Note that during data collection, it was found that nutrient content values were for the full dish, even if there was more than one serving in the dish.  The nutritional content values will be divided by the yield (number of servings). 

In [95]:
df_recs.dtypes.value_counts()

float64    80
object     11
dtype: int64

In [96]:
df_recs.select_dtypes(include = float).columns

Index(['yield', 'calories', 'total_weight', 'total_time', 'calcium_mg',
       'calcium_pct', 'carbs_g', 'carbs_pct', 'cholesterol_mg',
       'cholesterol_pct', 'energy_kcal', 'energy_pct', 'monounsat_fat_g',
       'polyunsat_fat_g', 'sat_fat_g', 'sat_fat_pct', 'fat_g', 'fat_pct',
       'trans_fat_g', 'iron_mg', 'iron_pct', 'fiber_g', 'fiber_pct',
       'folate_mcg', 'folate_pct', 'potassium_mg', 'potassium_pct',
       'magnesium_mg', 'magnesium_pct', 'sodium_mg', 'sodium_pct', 'niacin_mg',
       'niacin_pct', 'phosphorus_mg', 'phosphorus_pct', 'protein_g',
       'protein_pct', 'riboflavin_mg', 'riboflavin_pct', 'sugar_g',
       'thiamin_mg', 'thiamin_pct', 'vit_A_mcg', 'vit_A_pct', 'vit_B6_mg',
       'vit_B6_pct', 'vit_B12_mcg', 'vit_B12_pct', 'vit_C_mg', 'vit_C_pct',
       'vit_D_mcg', 'vit_D_pct', 'vit_E_mg', 'vit_E_pct', 'vit_K_mcg',
       'vit_K_pct', 'calcium_pct_div100', 'carbs_pct_div100',
       'cholesterol_pct_div100', 'energy_pct_div100', 'sat_fat_pct_div100',
  

In [97]:
columns_to_div_by_yield = df_recs.select_dtypes(include = float).columns.drop(labels = 'yield')
print(columns_to_div_by_yield)
print(len(columns_to_div_by_yield))

Index(['calories', 'total_weight', 'total_time', 'calcium_mg', 'calcium_pct',
       'carbs_g', 'carbs_pct', 'cholesterol_mg', 'cholesterol_pct',
       'energy_kcal', 'energy_pct', 'monounsat_fat_g', 'polyunsat_fat_g',
       'sat_fat_g', 'sat_fat_pct', 'fat_g', 'fat_pct', 'trans_fat_g',
       'iron_mg', 'iron_pct', 'fiber_g', 'fiber_pct', 'folate_mcg',
       'folate_pct', 'potassium_mg', 'potassium_pct', 'magnesium_mg',
       'magnesium_pct', 'sodium_mg', 'sodium_pct', 'niacin_mg', 'niacin_pct',
       'phosphorus_mg', 'phosphorus_pct', 'protein_g', 'protein_pct',
       'riboflavin_mg', 'riboflavin_pct', 'sugar_g', 'thiamin_mg',
       'thiamin_pct', 'vit_A_mcg', 'vit_A_pct', 'vit_B6_mg', 'vit_B6_pct',
       'vit_B12_mcg', 'vit_B12_pct', 'vit_C_mg', 'vit_C_pct', 'vit_D_mcg',
       'vit_D_pct', 'vit_E_mg', 'vit_E_pct', 'vit_K_mcg', 'vit_K_pct',
       'calcium_pct_div100', 'carbs_pct_div100', 'cholesterol_pct_div100',
       'energy_pct_div100', 'sat_fat_pct_div100', 'fat_pct_di

In [98]:
for column in columns_to_div_by_yield:
    df_recs[f'{column}_per_serv'] = df_recs[column]/df_recs['yield']

In [99]:
#Increasing the number of columns that will be displayed when inspecting the head of a dataframe:
pd.set_option('display.max_columns', 100)
df_recs.head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,calories,total_weight,total_time,cuisine_type,meal_type,dish_type,calcium_mg,calcium_pct,carbs_g,carbs_pct,cholesterol_mg,cholesterol_pct,energy_kcal,energy_pct,monounsat_fat_g,polyunsat_fat_g,sat_fat_g,sat_fat_pct,fat_g,fat_pct,trans_fat_g,iron_mg,iron_pct,fiber_g,fiber_pct,folate_mcg,folate_pct,potassium_mg,potassium_pct,magnesium_mg,magnesium_pct,sodium_mg,sodium_pct,niacin_mg,niacin_pct,phosphorus_mg,phosphorus_pct,protein_g,protein_pct,riboflavin_mg,riboflavin_pct,sugar_g,thiamin_mg,...,sodium_pct_per_serv,niacin_mg_per_serv,niacin_pct_per_serv,phosphorus_mg_per_serv,phosphorus_pct_per_serv,protein_g_per_serv,protein_pct_per_serv,riboflavin_mg_per_serv,riboflavin_pct_per_serv,sugar_g_per_serv,thiamin_mg_per_serv,thiamin_pct_per_serv,vit_A_mcg_per_serv,vit_A_pct_per_serv,vit_B6_mg_per_serv,vit_B6_pct_per_serv,vit_B12_mcg_per_serv,vit_B12_pct_per_serv,vit_C_mg_per_serv,vit_C_pct_per_serv,vit_D_mcg_per_serv,vit_D_pct_per_serv,vit_E_mg_per_serv,vit_E_pct_per_serv,vit_K_mcg_per_serv,vit_K_pct_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],174.357137,85.627736,0.0,['Italian'],['cocktail'],['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,174.357137,8.717857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105125,0.584027,0.0,0.0,0.283495,0.070874,28.077611,0.597396,3.118448,0.742488,2.557249,0.106552,0.047136,0.294597,5.693071,0.813296,0.019845,0.039689,0.006544,0.503351,0.223961,0.001452,...,0.053276,0.023568,0.147298,2.846536,0.406648,0.009922,0.019845,0.003272,0.251675,0.111981,0.000726,0.060509,0.0,0.0,0.007657,0.589021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],2124.14,603.5,60.0,['Italian'],['pasta'],['dinner'],476.235,47.6235,202.981,67.660333,3689.0,1229.666667,2124.14,106.207,49.976435,16.746705,34.72498,173.6249,106.186,163.363077,0.0,12.2826,68.236667,6.75,27.0,561.4,140.35,638.235,13.579468,72.0,17.142857,168.47,7.019583,3.2066,20.04125,1596.0,228.0,79.749,159.498,1.8952,145.784615,2.579,0.8984,...,1.169931,0.534433,3.340208,266.0,38.0,13.2915,26.583,0.315867,24.297436,0.429833,0.149733,12.477778,215.9,23.988889,0.216667,16.666667,1.105,46.041667,0.0,0.0,3.06,20.4,1.809875,12.065833,1.876167,1.563472,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],163.576748,85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],2.267962,0.226796,0.771107,0.257036,0.0,0.0,163.576748,8.178837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116233,0.645739,0.0,0.0,0.283495,0.070874,28.633018,0.609213,3.118448,0.742488,2.267962,0.094498,0.050746,0.31716,6.803886,0.971984,0.019845,0.039689,0.007654,0.588798,0.223961,0.003118,...,0.047249,0.025373,0.15858,3.401943,0.485992,0.009922,0.019845,0.003827,0.294399,0.111981,0.001559,0.129935,0.0,0.0,0.007796,0.599701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1306.5,452.134955,0.0,['Italian'],['pasta'],['dinner'],129.512389,12.951239,230.01,76.67,558.0,186.0,1306.5,65.325,5.748,4.1055,5.154,25.77,17.205,26.469231,0.057,6.142045,34.122474,8.1,32.4,148.5,37.125,528.170796,11.237677,84.02135,20.005083,1046.466,43.60275,3.8625,24.140625,621.0,88.714286,49.83,99.66,0.8055,61.961538,1.365,0.42,...,7.267125,0.64375,4.023438,103.5,14.785714,8.305,16.61,0.13425,10.326923,0.2275,0.07,5.833333,40.0,4.444444,0.0645,4.961538,0.2225,9.270833,0.0,0.0,0.5,3.333333,0.2925,1.95,0.225,0.1875,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],1807.630667,559.808863,0.0,['Italian'],['noodle'],['dinner'],184.244127,18.424413,314.954113,104.984704,904.373333,301.457778,1807.630667,90.381533,9.893329,5.658615,8.471165,42.355827,26.678313,41.043559,0.03268,16.639369,92.44094,13.263,53.052,966.805333,241.701333,755.750042,16.079788,146.532755,34.888751,1294.982296,53.957596,20.365128,127.28205,877.48,125.354286,66.345453,132.690907,2.046036,157.387385,1.294853,2.559305,...,8.992933,3.394188,21.213675,146.246667,20.892381,11.057576,22.115151,0.341006,26.231231,0.215809,0.426551,35.545907,57.138667,6.348741,0.102791,7.906966,0.302633,12.609722,0.0,0.0,0.771467,5.143111,0.47796,3.1864,0.230844,0.19237,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924


In [100]:
df_recs[['yield','calcium_pct_div100', 'calcium_pct_div100_per_serv']].head()

Unnamed: 0,yield,calcium_pct_div100,calcium_pct_div100_per_serv
0,2.0,0.002268,0.001134
1,6.0,0.476235,0.079372
2,2.0,0.002268,0.001134
3,6.0,0.129512,0.021585
4,6.0,0.184244,0.030707


After checking nutrient values were divided by their corresponding yeilds (number of servings), it appears that the nutrient proportions per serving were created properly.

In [101]:
#Saving df_recs in this state (before removing redundant numerical columns):
df_recs.to_csv('./data/df_recs_without_nulls_with_proportionsDV_per_serv_final.csv')

#### Removing redundant numerical columns:

In [102]:
columns_to_drop = columns_to_div_by_yield.drop(labels=['total_weight','total_time'])
columns_to_drop

Index(['calories', 'calcium_mg', 'calcium_pct', 'carbs_g', 'carbs_pct',
       'cholesterol_mg', 'cholesterol_pct', 'energy_kcal', 'energy_pct',
       'monounsat_fat_g', 'polyunsat_fat_g', 'sat_fat_g', 'sat_fat_pct',
       'fat_g', 'fat_pct', 'trans_fat_g', 'iron_mg', 'iron_pct', 'fiber_g',
       'fiber_pct', 'folate_mcg', 'folate_pct', 'potassium_mg',
       'potassium_pct', 'magnesium_mg', 'magnesium_pct', 'sodium_mg',
       'sodium_pct', 'niacin_mg', 'niacin_pct', 'phosphorus_mg',
       'phosphorus_pct', 'protein_g', 'protein_pct', 'riboflavin_mg',
       'riboflavin_pct', 'sugar_g', 'thiamin_mg', 'thiamin_pct', 'vit_A_mcg',
       'vit_A_pct', 'vit_B6_mg', 'vit_B6_pct', 'vit_B12_mcg', 'vit_B12_pct',
       'vit_C_mg', 'vit_C_pct', 'vit_D_mcg', 'vit_D_pct', 'vit_E_mg',
       'vit_E_pct', 'vit_K_mcg', 'vit_K_pct', 'calcium_pct_div100',
       'carbs_pct_div100', 'cholesterol_pct_div100', 'energy_pct_div100',
       'sat_fat_pct_div100', 'fat_pct_div100', 'iron_pct_div100',
    

In [103]:
df_recs.drop(labels=columns_to_drop,
             axis = 1,
             inplace = True)

In [104]:
df_cols_inv = df_recs.select_dtypes(include = float).columns.drop(labels = ['yield', 'total_weight', 'total_time'])
df_cols_inv

Index(['calories_per_serv', 'total_weight_per_serv', 'total_time_per_serv',
       'calcium_mg_per_serv', 'calcium_pct_per_serv', 'carbs_g_per_serv',
       'carbs_pct_per_serv', 'cholesterol_mg_per_serv',
       'cholesterol_pct_per_serv', 'energy_kcal_per_serv',
       'energy_pct_per_serv', 'monounsat_fat_g_per_serv',
       'polyunsat_fat_g_per_serv', 'sat_fat_g_per_serv',
       'sat_fat_pct_per_serv', 'fat_g_per_serv', 'fat_pct_per_serv',
       'trans_fat_g_per_serv', 'iron_mg_per_serv', 'iron_pct_per_serv',
       'fiber_g_per_serv', 'fiber_pct_per_serv', 'folate_mcg_per_serv',
       'folate_pct_per_serv', 'potassium_mg_per_serv',
       'potassium_pct_per_serv', 'magnesium_mg_per_serv',
       'magnesium_pct_per_serv', 'sodium_mg_per_serv', 'sodium_pct_per_serv',
       'niacin_mg_per_serv', 'niacin_pct_per_serv', 'phosphorus_mg_per_serv',
       'phosphorus_pct_per_serv', 'protein_g_per_serv', 'protein_pct_per_serv',
       'riboflavin_mg_per_serv', 'riboflavin_pct_per_serv'

In [105]:
cols_rmv = []
for col in df_cols_inv:
    if 'pct_per_serv' in col:
        cols_rmv.append(col)

In [106]:
cols_rmv

['calcium_pct_per_serv',
 'carbs_pct_per_serv',
 'cholesterol_pct_per_serv',
 'energy_pct_per_serv',
 'sat_fat_pct_per_serv',
 'fat_pct_per_serv',
 'iron_pct_per_serv',
 'fiber_pct_per_serv',
 'folate_pct_per_serv',
 'potassium_pct_per_serv',
 'magnesium_pct_per_serv',
 'sodium_pct_per_serv',
 'niacin_pct_per_serv',
 'phosphorus_pct_per_serv',
 'protein_pct_per_serv',
 'riboflavin_pct_per_serv',
 'thiamin_pct_per_serv',
 'vit_A_pct_per_serv',
 'vit_B6_pct_per_serv',
 'vit_B12_pct_per_serv',
 'vit_C_pct_per_serv',
 'vit_D_pct_per_serv',
 'vit_E_pct_per_serv',
 'vit_K_pct_per_serv']

In [107]:
#Removing columns that are pct daily values since the propotions of daily value can represent the same thing:
df_recs.drop(labels=cols_rmv,
             axis = 1,
             inplace = True)

In [108]:
df_recs.columns

Index(['title', 'source', 'url', 'yield', 'diet_labels', 'health_labels',
       'cautions', 'total_weight', 'total_time', 'cuisine_type', 'meal_type',
       'dish_type', 'ingredients', 'ingredient_categories',
       'calories_per_serv', 'total_weight_per_serv', 'total_time_per_serv',
       'calcium_mg_per_serv', 'carbs_g_per_serv', 'cholesterol_mg_per_serv',
       'energy_kcal_per_serv', 'monounsat_fat_g_per_serv',
       'polyunsat_fat_g_per_serv', 'sat_fat_g_per_serv', 'fat_g_per_serv',
       'trans_fat_g_per_serv', 'iron_mg_per_serv', 'fiber_g_per_serv',
       'folate_mcg_per_serv', 'potassium_mg_per_serv', 'magnesium_mg_per_serv',
       'sodium_mg_per_serv', 'niacin_mg_per_serv', 'phosphorus_mg_per_serv',
       'protein_g_per_serv', 'riboflavin_mg_per_serv', 'sugar_g_per_serv',
       'thiamin_mg_per_serv', 'vit_A_mcg_per_serv', 'vit_B6_mg_per_serv',
       'vit_B12_mcg_per_serv', 'vit_C_mg_per_serv', 'vit_D_mcg_per_serv',
       'vit_E_mg_per_serv', 'vit_K_mcg_per_serv',


In [109]:
#Saving df_recs in this state (before removing nutritional content columns that aren't proportions of daily value):
df_recs.to_csv('./data/df_recs_without_nulls_with_proportionsDV_per_serv_and_mass_nutrs_final.csv')

#### Removing masses of nutrients since this information is contained in the proportion of daily value columns

In [110]:
cols_inv = df_recs.select_dtypes(include = float).columns.drop(
           labels = ['yield', 'total_weight', 'total_time', 'calories_per_serv', 
                     'total_weight_per_serv', 'total_time_per_serv',
                     'monounsat_fat_g_per_serv', 'polyunsat_fat_g_per_serv', 
                     'trans_fat_g_per_serv', 'sugar_g_per_serv'])
cols_inv

Index(['calcium_mg_per_serv', 'carbs_g_per_serv', 'cholesterol_mg_per_serv',
       'energy_kcal_per_serv', 'sat_fat_g_per_serv', 'fat_g_per_serv',
       'iron_mg_per_serv', 'fiber_g_per_serv', 'folate_mcg_per_serv',
       'potassium_mg_per_serv', 'magnesium_mg_per_serv', 'sodium_mg_per_serv',
       'niacin_mg_per_serv', 'phosphorus_mg_per_serv', 'protein_g_per_serv',
       'riboflavin_mg_per_serv', 'thiamin_mg_per_serv', 'vit_A_mcg_per_serv',
       'vit_B6_mg_per_serv', 'vit_B12_mcg_per_serv', 'vit_C_mg_per_serv',
       'vit_D_mcg_per_serv', 'vit_E_mg_per_serv', 'vit_K_mcg_per_serv',
       'calcium_pct_div100_per_serv', 'carbs_pct_div100_per_serv',
       'cholesterol_pct_div100_per_serv', 'energy_pct_div100_per_serv',
       'sat_fat_pct_div100_per_serv', 'fat_pct_div100_per_serv',
       'iron_pct_div100_per_serv', 'fiber_pct_div100_per_serv',
       'folate_pct_div100_per_serv', 'potassium_pct_div100_per_serv',
       'magnesium_pct_div100_per_serv', 'sodium_pct_div100_per_s

In [111]:
df_recs[cols_inv].head()

Unnamed: 0,calcium_mg_per_serv,carbs_g_per_serv,cholesterol_mg_per_serv,energy_kcal_per_serv,sat_fat_g_per_serv,fat_g_per_serv,iron_mg_per_serv,fiber_g_per_serv,folate_mcg_per_serv,potassium_mg_per_serv,magnesium_mg_per_serv,sodium_mg_per_serv,niacin_mg_per_serv,phosphorus_mg_per_serv,protein_g_per_serv,riboflavin_mg_per_serv,thiamin_mg_per_serv,vit_A_mcg_per_serv,vit_B6_mg_per_serv,vit_B12_mcg_per_serv,vit_C_mg_per_serv,vit_D_mcg_per_serv,vit_E_mg_per_serv,vit_K_mcg_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv
0,1.133981,0.385554,0.0,87.178569,0.0,0.0,0.052562,0.0,0.141748,14.038806,1.559224,1.278624,0.023568,2.846536,0.009922,0.003272,0.000726,0.0,0.007657,0.0,0.0,0.0,0.0,0.0,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0
1,79.3725,33.830167,614.833333,354.023333,5.787497,17.697667,2.0471,1.125,93.566667,106.3725,12.0,28.078333,0.534433,266.0,13.2915,0.315867,0.149733,215.9,0.216667,1.105,0.0,3.06,1.809875,1.876167,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635
2,1.133981,0.385554,0.0,81.788374,0.0,0.0,0.058117,0.0,0.141748,14.316509,1.559224,1.133981,0.025373,3.401943,0.009922,0.003827,0.001559,0.0,0.007796,0.0,0.0,0.0,0.0,0.0,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0
3,21.585398,38.335,93.0,217.75,0.859,2.8675,1.023674,1.35,24.75,88.028466,14.003558,174.411,0.64375,103.5,8.305,0.13425,0.07,40.0,0.0645,0.2225,0.0,0.5,0.2925,0.225,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875
4,30.707355,52.492352,150.728889,301.271778,1.411861,4.446386,2.773228,2.2105,161.134222,125.95834,24.422126,215.830383,3.394188,146.246667,11.057576,0.341006,0.426551,57.138667,0.102791,0.302633,0.0,0.771467,0.47796,0.230844,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924


The nutrients with content mass per serving values in cols_inv also have proportion of daily value per serving values.  It would be redundant to use both values, so the nutrient content mass per serving will be removed since the proportion of daily value per serving includes more information.

In [112]:
cols_rmv = []
for col in cols_inv:
    if 'div100_per_serv' not in col:
        cols_rmv.append(col)

In [113]:
cols_rmv

['calcium_mg_per_serv',
 'carbs_g_per_serv',
 'cholesterol_mg_per_serv',
 'energy_kcal_per_serv',
 'sat_fat_g_per_serv',
 'fat_g_per_serv',
 'iron_mg_per_serv',
 'fiber_g_per_serv',
 'folate_mcg_per_serv',
 'potassium_mg_per_serv',
 'magnesium_mg_per_serv',
 'sodium_mg_per_serv',
 'niacin_mg_per_serv',
 'phosphorus_mg_per_serv',
 'protein_g_per_serv',
 'riboflavin_mg_per_serv',
 'thiamin_mg_per_serv',
 'vit_A_mcg_per_serv',
 'vit_B6_mg_per_serv',
 'vit_B12_mcg_per_serv',
 'vit_C_mg_per_serv',
 'vit_D_mcg_per_serv',
 'vit_E_mg_per_serv',
 'vit_K_mcg_per_serv']

In [114]:
#Removing nutrient mass per serving columns that have proportion of daily value per serving values represented:
df_recs.drop(labels=cols_rmv,
             axis = 1,
             inplace = True)

In [115]:
df_recs.shape

(22941, 45)

In [116]:
df_recs.columns

Index(['title', 'source', 'url', 'yield', 'diet_labels', 'health_labels',
       'cautions', 'total_weight', 'total_time', 'cuisine_type', 'meal_type',
       'dish_type', 'ingredients', 'ingredient_categories',
       'calories_per_serv', 'total_weight_per_serv', 'total_time_per_serv',
       'monounsat_fat_g_per_serv', 'polyunsat_fat_g_per_serv',
       'trans_fat_g_per_serv', 'sugar_g_per_serv',
       'calcium_pct_div100_per_serv', 'carbs_pct_div100_per_serv',
       'cholesterol_pct_div100_per_serv', 'energy_pct_div100_per_serv',
       'sat_fat_pct_div100_per_serv', 'fat_pct_div100_per_serv',
       'iron_pct_div100_per_serv', 'fiber_pct_div100_per_serv',
       'folate_pct_div100_per_serv', 'potassium_pct_div100_per_serv',
       'magnesium_pct_div100_per_serv', 'sodium_pct_div100_per_serv',
       'niacin_pct_div100_per_serv', 'phosphorus_pct_div100_per_serv',
       'protein_pct_div100_per_serv', 'riboflavin_pct_div100_per_serv',
       'thiamin_pct_div100_per_serv', 'vit_A_pc

In [117]:
df_recs.head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],85.627736,0.0,['Italian'],['cocktail'],['dinner'],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],603.5,60.0,['Italian'],['pasta'],['dinner'],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],85.048569,0.0,['American'],"['cocktail', 'drink']",['dinner'],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],452.134955,0.0,['Italian'],['pasta'],['dinner'],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],559.808863,0.0,['Italian'],['noodle'],['dinner'],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924


In [118]:
#Saving df_recs in this state (before making dummy matrices):
df_recs.to_csv('./data/df_recs_clean_numerical_cols_final.csv')

#### Generating Dummies of Categorical Columns:

In [119]:
#Looking at colemns with object type data:
df_recs.select_dtypes(include = object).head()

#Source: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html

Unnamed: 0,title,source,url,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],['Italian'],['cocktail'],['dinner'],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,"['Balanced', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Vegetari...",[],['Italian'],['pasta'],['dinner'],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,"['Low-Fat', 'Low-Sodium']","['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],['American'],"['cocktail', 'drink']",['dinner'],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],['Italian'],['pasta'],['dinner'],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,['Low-Fat'],"['Sugar-Conscious', 'Low Potassium', 'Kidney-F...",[],['Italian'],['noodle'],['dinner'],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [120]:
#Creating a list of columns will dummy:
cols_will_dum = ['source', 'diet_labels', 'health_labels', 'cautions', 'cuisine_type', 'meal_type',
                 'dish_type', 'ingredients', 'ingredient_categories']

In [121]:
#Taking data out of outer quotes if they are present (this was from the json format the data was acquired from)
for dum in cols_will_dum:
    try:
        df_recs[dum] = df_recs[dum].apply(lambda x: eval(x))
    except:
        pass

In [122]:
#Checking work:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],[Italian],[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[American],"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [123]:
#Creating a dummy matrix of sources: 
#Dropping the first column because it can be gathered that if the source is not any of the other sources,
#it is the source that is dropped.
df_source_dum = pd.get_dummies(df_recs['source'], 
                               prefix = 'source', #adding a prefix to column names to show from source column
                               prefix_sep = '_', #adding prefix separator
                               drop_first = True, #dropping first column to avoid dummy dataframe redundancy
                               dummy_na=True) #making sure a column is formed to show if a source is a null value

#Checking work:
df_source_dum.head()

#Sources: 
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
#https://datascience.stackexchange.com/questions/28353/always-drop-the-first-column-after-performing-one-hot-encoding

Unnamed: 0,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [124]:
#Checking dimensions of df_source_dum:
df_source_dum.shape

(22941, 20)

There are some messy sources like source_143.95.39.55, but I decided to move forward with using the sources dummy matrix in hopes that the odd sources won't affect the recommender too much (especially because the source dummy matrix have over 4000 columns - a lot to sort through!).  I would like to clean up the source dummy matrix in the future though.

In [125]:
#Saving source dummy matrix:
df_source_dum.to_csv('./data/df_source_dum_final.csv')

#### Generating dummies by hand for data that is formatted in a way that the pd.get_dummies function can't read it well

**Generating dummies dataframe for the diet_labels column:**

In [126]:
#Looking at diet_labels data that is embedded in a list:
df_recs['diet_labels'].iloc[0]

['Low-Fat', 'Low-Sodium']

In [127]:
#Unpacking an item embedded in a list:
df_recs['diet_labels'].iloc[0][0]

'Low-Fat'

In [128]:
#Creating a list of all diet_labels that are embedded in lists:
diet_labels_list_total = []
for diet_label in df_recs['diet_labels']:
    diet_labels_list_total.extend(diet_label)

In [129]:
#Looking at how many diet labels are in diet_labels_list_total:
len(diet_labels_list_total)

19908

In [130]:
#Inspecting the first 10 diet_labels in diet_labels_list_total:
diet_labels_list_total[0:10]

['Low-Fat',
 'Low-Sodium',
 'Balanced',
 'Low-Sodium',
 'Low-Fat',
 'Low-Sodium',
 'Low-Fat',
 'Low-Fat',
 'Low-Fat',
 'Low-Sodium']

In [131]:
#Generating a list of unique diet labels:
unique_diet_labels = list(np.unique(diet_labels_list_total))
unique_diet_labels 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

['Balanced', 'High-Fiber', 'High-Protein', 'Low-Carb', 'Low-Fat', 'Low-Sodium']

There are only six unique diet labels.

In [132]:
#Creating diet label dummie column names which will become headings for the diet labels dummy dataframe:
diet_label_dummies_cols = []
for diet_label in unique_diet_labels:
    diet_label_dummies_cols.append(f'diet_labels_{diet_label}') 
diet_label_dummies_cols

['diet_labels_Balanced',
 'diet_labels_High-Fiber',
 'diet_labels_High-Protein',
 'diet_labels_Low-Carb',
 'diet_labels_Low-Fat',
 'diet_labels_Low-Sodium']

In [133]:
#Instantiating the diet labels dummy dataframe:
df_diet_labels_dum = pd.DataFrame(columns=diet_label_dummies_cols)
df_diet_labels_dum

Unnamed: 0,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium


In [134]:
#Adding each row's embedded dummy labels to df_diet_labels_dum 
#(0 if not present in the embedded diet_labels list, 1 if present in the embedded diet_labels list):
for i in range(len(df_recs['diet_labels'])):
    #Instantiating a list of values for the row:
    row = []
    #Looping through each unique diet label, and placing a 1 if the diet label embedded in that row of
    #df_recs['diet_labels'] and 0 if not: 
    for diet_label in unique_diet_labels: 
        if diet_label in df_recs['diet_labels'].iloc[i]:
            row.append(1)
        else:
            row.append(0)
    #Adding the row of dummy values to df_diet_labels_dum at the corresponding location:
    df_diet_labels_dum.loc[i] = row
            

In [135]:
df_diet_labels_dum.head()

Unnamed: 0,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium
0,0,0,0,0,1,1
1,1,0,0,0,0,1
2,0,0,0,0,1,1
3,0,0,0,0,1,0
4,0,0,0,0,1,0


In [136]:
df_diet_labels_dum.shape

(22941, 6)

The dummy values match the values in df_recs['diet_labels'], so the diet labels dummy dataframe was successfully created.  A dummy column will not be removed from this dataframe because there can be multiple columns with a 1 rather than the case with the sources column (where recipes only came from one source - only one column could be filled with a 1 in that case).

In [137]:
#Saving diet labels dummy matrix:
df_diet_labels_dum.to_csv('./data/df_diet_labels_dum_final.csv')

**Generating dummies dataframe for the health_labels column:**  

In [138]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],[Italian],[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[American],"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [139]:
#Looking at health_labels data that is embedded in a list:
df_recs['health_labels'].iloc[0]

['Sugar-Conscious',
 'Low Potassium',
 'Kidney-Friendly',
 'Keto-Friendly',
 'Vegan',
 'Vegetarian',
 'Pescatarian',
 'Dairy-Free',
 'Gluten-Free',
 'Wheat-Free',
 'Egg-Free',
 'Peanut-Free',
 'Tree-Nut-Free',
 'Soy-Free',
 'Fish-Free',
 'Shellfish-Free',
 'Pork-Free',
 'Red-Meat-Free',
 'Crustacean-Free',
 'Celery-Free',
 'Mustard-Free',
 'Sesame-Free',
 'Lupine-Free',
 'Mollusk-Free',
 'No oil added',
 'Kosher',
 'Alcohol-Cocktail']

In [140]:
#Unpacking an item embedded in a list:
df_recs['health_labels'].iloc[0][0]

'Sugar-Conscious'

In [141]:
#Creating a list of all health_labels that are embedded in lists:
health_labels_list_total = []
for health_label in df_recs['health_labels']:
    health_labels_list_total.extend(health_label)

In [142]:
#Looking at how many health labels are in health_labels_list_total:
len(health_labels_list_total)

397872

In [143]:
#Inspecting the first 10 health_labels in health_labels_list_total:
health_labels_list_total[0:10]

['Sugar-Conscious',
 'Low Potassium',
 'Kidney-Friendly',
 'Keto-Friendly',
 'Vegan',
 'Vegetarian',
 'Pescatarian',
 'Dairy-Free',
 'Gluten-Free',
 'Wheat-Free']

In [144]:
#Generating a list of unique health labels:
unique_health_labels = list(np.unique(health_labels_list_total))
unique_health_labels 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

['Alcohol-Cocktail',
 'Alcohol-Free',
 'Celery-Free',
 'Crustacean-Free',
 'Dairy-Free',
 'Egg-Free',
 'Fish-Free',
 'Gluten-Free',
 'Keto-Friendly',
 'Kidney-Friendly',
 'Kosher',
 'Low Potassium',
 'Low Sugar',
 'Lupine-Free',
 'Mollusk-Free',
 'Mustard-Free',
 'No oil added',
 'Paleo',
 'Peanut-Free',
 'Pescatarian',
 'Pork-Free',
 'Red-Meat-Free',
 'Sesame-Free',
 'Shellfish-Free',
 'Soy-Free',
 'Sugar-Conscious',
 'Tree-Nut-Free',
 'Vegan',
 'Vegetarian',
 'Wheat-Free']

In [145]:
#Finding how many unique health labels there are:
len(unique_health_labels)

30

There are 30 unique health labels.

In [146]:
#Creating health label dummie column names which will become headings for the health labels dummy dataframe:
health_label_dummies_cols = []
for health_label in unique_health_labels:
    health_label_dummies_cols.append(f'health_labels_{health_label}') 
health_label_dummies_cols

['health_labels_Alcohol-Cocktail',
 'health_labels_Alcohol-Free',
 'health_labels_Celery-Free',
 'health_labels_Crustacean-Free',
 'health_labels_Dairy-Free',
 'health_labels_Egg-Free',
 'health_labels_Fish-Free',
 'health_labels_Gluten-Free',
 'health_labels_Keto-Friendly',
 'health_labels_Kidney-Friendly',
 'health_labels_Kosher',
 'health_labels_Low Potassium',
 'health_labels_Low Sugar',
 'health_labels_Lupine-Free',
 'health_labels_Mollusk-Free',
 'health_labels_Mustard-Free',
 'health_labels_No oil added',
 'health_labels_Paleo',
 'health_labels_Peanut-Free',
 'health_labels_Pescatarian',
 'health_labels_Pork-Free',
 'health_labels_Red-Meat-Free',
 'health_labels_Sesame-Free',
 'health_labels_Shellfish-Free',
 'health_labels_Soy-Free',
 'health_labels_Sugar-Conscious',
 'health_labels_Tree-Nut-Free',
 'health_labels_Vegan',
 'health_labels_Vegetarian',
 'health_labels_Wheat-Free']

In [147]:
#Instantiating the health labels dummy dataframe:
df_health_labels_dum = pd.DataFrame(columns=health_label_dummies_cols)
df_health_labels_dum

Unnamed: 0,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free


In [148]:
#Adding each row's embedded dummy labels to df_health_labels_dum 
#(0 if not present in the embedded health_labels list, 1 if present in the embedded health_labels list):
for i in range(len(df_recs['health_labels'])):
    #Instantiating a list of values for the row:
    row = []
    #Looping through each unique health label, and placing a 1 if the health label embedded in that row of
    #df_recs['health_labels'] and 0 if not: 
    for health_label in unique_health_labels: 
        if health_label in df_recs['health_labels'].iloc[i]:
            row.append(1)
        else:
            row.append(0)
    #Adding the row of dummy values to df_health_labels_dum at the corresponding location:
    df_health_labels_dum.loc[i] = row

In [149]:
#Looking at health_labels data that is embedded in a list:
df_recs['health_labels'].iloc[0]

['Sugar-Conscious',
 'Low Potassium',
 'Kidney-Friendly',
 'Keto-Friendly',
 'Vegan',
 'Vegetarian',
 'Pescatarian',
 'Dairy-Free',
 'Gluten-Free',
 'Wheat-Free',
 'Egg-Free',
 'Peanut-Free',
 'Tree-Nut-Free',
 'Soy-Free',
 'Fish-Free',
 'Shellfish-Free',
 'Pork-Free',
 'Red-Meat-Free',
 'Crustacean-Free',
 'Celery-Free',
 'Mustard-Free',
 'Sesame-Free',
 'Lupine-Free',
 'Mollusk-Free',
 'No oil added',
 'Kosher',
 'Alcohol-Cocktail']

In [150]:
#Checking work:
df_health_labels_dum.head()

Unnamed: 0,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free
0,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0
2,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
3,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0
4,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0


In [151]:
df_health_labels_dum.shape

(22941, 30)

The first row of dummy values match the values in the first row df_recs['health_labels'], so the health labels dummy dataframe was successfully created.  A dummy column will not be removed from this dataframe because there can be multiple columns with a 1 rather than the case with the sources column (where recipes only came from one source - only one column could be filled with a 1 and the rest of the columns were filled with 0's in that case).

In [152]:
#Saving health labels dummy matrix:
df_health_labels_dum.to_csv('./data/df_health_labels_dum_final.csv')

**Generating dummies dataframe for the cautions column:**

In [157]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head(10)

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],[Italian],[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[American],"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."
5,Serious Eats,"[Low-Fat, Low-Sodium]","[Low Potassium, Kidney-Friendly, Vegetarian, P...",[Sulfites],[South American],[cocktail],[dinner],"[pisco, lime juice, simple syrup, egg white, b...","[liquors and cocktails, fruit, sugars, Eggs, l..."
6,Martha Stewart,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[pasta],[dinner],"[all-purpose flour, eggs]","[grains, Eggs]"
7,Food52,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Keto-Friendly, Vegan, Vegeta...",[],[South American],[side dish],[dinner],"[cauliflower, pam, salt, turmeric]","[vegetables, Oils, Condiments and sauces, Cond..."
8,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[American],"[cocktail, drink]",[dinner],"[rye whiskey, sweet vermouth, Angostura bitter...","[liquors and cocktails, wines, liquors and coc..."
9,Serious Eats,"[Low-Fat, Low-Sodium]","[Vegan, Vegetarian, Pescatarian, Dairy-Free, G...",[Sulfites],[British],"[cocktail, drink]",[dinner],"[apple cider, dark brown sugar, orange, cloves]","[fruit, sugars, fruit, Condiments and sauces]"


In [158]:
#Looking at cautions data that is embedded in a list:
df_recs['cautions'].iloc[5]

['Sulfites']

In [159]:
#Unpacking an item embedded in a list:
df_recs['cautions'].iloc[5][0]

'Sulfites'

In [160]:
#Creating a list of all cautions that are embedded in lists:
cautions_list_total = []
for cautions in df_recs['cautions']:
    cautions_list_total.extend(cautions)

In [161]:
#Looking at how many cautions are in cautions_list_total:
len(cautions_list_total)

33234

In [162]:
#Inspecting the first 10 cautions in cautions_list_total:
cautions_list_total[0:10]

['Sulfites',
 'Sulfites',
 'Sulfites',
 'Sulfites',
 'Wheat',
 'Sulfites',
 'Sulfites',
 'Sulfites',
 'Sulfites',
 'FODMAP']

In [163]:
#Generating a list of unique caution values:
unique_cautions = list(np.unique(cautions_list_total))
unique_cautions 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

['Eggs',
 'FODMAP',
 'Gluten',
 'Milk',
 'Peanuts',
 'Shellfish',
 'Soy',
 'Sulfites',
 'Tree-Nuts',
 'Wheat']

In [164]:
#Finding how many unique caution values there are:
len(unique_cautions)

10

There are 10 unique caution values.

In [165]:
#Creating cautions dummie column names which will become headings for the cautions dummy dataframe:
caution_dummies_cols = []
for caution in unique_cautions:
    caution_dummies_cols.append(f'cautions_{caution}') 
caution_dummies_cols

['cautions_Eggs',
 'cautions_FODMAP',
 'cautions_Gluten',
 'cautions_Milk',
 'cautions_Peanuts',
 'cautions_Shellfish',
 'cautions_Soy',
 'cautions_Sulfites',
 'cautions_Tree-Nuts',
 'cautions_Wheat']

In [166]:
#Instantiating the cautions dummy dataframe:
df_cautions_dum = pd.DataFrame(columns=caution_dummies_cols)
df_cautions_dum

Unnamed: 0,cautions_Eggs,cautions_FODMAP,cautions_Gluten,cautions_Milk,cautions_Peanuts,cautions_Shellfish,cautions_Soy,cautions_Sulfites,cautions_Tree-Nuts,cautions_Wheat


In [167]:
#Adding each row's embedded dummy labels to df_cautions_dum 
#(0 if not present in the embedded cautions list, 1 if present in the embedded cautions list):
for i in range(len(df_recs['cautions'])):
    #Instantiating a list of values for the row:
    row = []
    #Looping through each unique caution value, and placing a 1 if the caution embedded in that row of
    #df_recs['cautions'] and 0 if not: 
    for caution in unique_cautions: 
        if caution in df_recs['cautions'].iloc[i]:
            row.append(1)
        else:
            row.append(0)
    #Adding the row of dummy values to df_cautions_dum at the corresponding location:
    df_cautions_dum.loc[i] = row

In [168]:
#Checking work:
df_cautions_dum.head()

Unnamed: 0,cautions_Eggs,cautions_FODMAP,cautions_Gluten,cautions_Milk,cautions_Peanuts,cautions_Shellfish,cautions_Soy,cautions_Sulfites,cautions_Tree-Nuts,cautions_Wheat
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


In [169]:
df_cautions_dum.shape

(22941, 10)

The values in the head of df_cautions_dum match the corresponding values in df_recs['cautions'], so the cautions dummy dataframe was successfully created.  A dummy column will not be removed from this dataframe because there can be multiple columns with a 1 rather than the case with the sources column (where recipes only came from one source - only one column could be filled with a 1 and the rest of the columns were filled with 0's in that case).

In [170]:
#Saving cautions dummy matrix:
df_cautions_dum.to_csv('./data/df_cautions_dum_final.csv')

**Generating dummies dataframe for the cuisine_type column:**

In [171]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],[Italian],[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[American],"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],[Italian],[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [172]:
#Proving that each list in the cuisine_type only contains zero or one cuisine type:
len_cuisine_list = []
for i in range(len(df_recs['cuisine_type'])):
    len_cuisine_list.append(len(df_recs['cuisine_type'].iloc[i]))

In [173]:
#Looking at the first five cuisine_type list lengths:
len_cuisine_list[0:5]

[1, 1, 1, 1, 1]

In [174]:
#Looking at the minimum cuisine_type list length:
min(len_cuisine_list)

1

In [175]:
#Looking at the maximum cuisine_type list length:
max(len_cuisine_list)

1

There is only one cuisine type per row, so the pd.get_dummies fuction is appropriate to use in this situation.  First, the cuisine types have to be removed from their lists.

In [176]:
#Removing cuisine_type values from their lists (only one cuisine_type in each list, so taking out first element 
#from each cuisine_type list):
df_recs['cuisine_type'] = [cuisine[0] for cuisine in df_recs['cuisine_type']]

In [177]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


The cuisine types were successfully removed from their lists.

In [178]:
#Creating a dummy matrix of cuisine types: 
#Dropping the first column because it can be gathered that if the cuisine type is not any of the other cuisine types,
#it is the cuisine type that is dropped.
df_cuisine_type_dum = pd.get_dummies(df_recs['cuisine_type'], 
                               prefix = 'cuisine_type', #adding a prefix to column names to show from source column
                               prefix_sep = '_', #adding prefix separator
                               drop_first = True, #dropping first column to avoid dummy dataframe redundancy
                               dummy_na=True) #making sure a column is formed to show if a source is a null value

#Checking work:
df_cuisine_type_dum.head()

#Sources: 
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
#https://datascience.stackexchange.com/questions/28353/always-drop-the-first-column-after-performing-one-hot-encoding

Unnamed: 0,cuisine_type_.,cuisine_type_4 Points,cuisine_type_African,cuisine_type_Albanian,cuisine_type_Amerian,cuisine_type_American,"cuisine_type_American, Barbecue, Southern","cuisine_type_American, French","cuisine_type_American, Italian","cuisine_type_American, Southern",cuisine_type_American-South,cuisine_type_American|Asian,cuisine_type_American|North American|Cajun/Creole,cuisine_type_Amish,cuisine_type_Andhra,cuisine_type_Appetizer,cuisine_type_Arab,cuisine_type_Argentinian,cuisine_type_Armenian,cuisine_type_Asian,"cuisine_type_Asian, Chinese",cuisine_type_Aussie,cuisine_type_Australian,cuisine_type_Australian/New Zealand,cuisine_type_Austrian,cuisine_type_Baking,cuisine_type_Balochi,cuisine_type_Bangladeshi,cuisine_type_Belarusian,cuisine_type_Belgian,cuisine_type_Belgium,cuisine_type_Brazilian,cuisine_type_Breakfast,cuisine_type_British,cuisine_type_BritishEurope,cuisine_type_Bulgarian,cuisine_type_Cajun,cuisine_type_Cajun/Creole|Southern,cuisine_type_Canadian,cuisine_type_Caribbean,cuisine_type_Central American/Caribbean,cuisine_type_Cheap,cuisine_type_Chechen,cuisine_type_Chinese,cuisine_type_Colombian,cuisine_type_Cookie,cuisine_type_Costa Rican,cuisine_type_Creole,cuisine_type_Cuban,cuisine_type_Cuban|Central American/Caribbean,...,cuisine_type_Portuguese,cuisine_type_Puerto Rican,cuisine_type_Raisine,cuisine_type_Romanian,cuisine_type_Russian,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [179]:
df_cuisine_type_dum.shape

(22941, 179)

The cuisine type categorical data was successfully placed in a dummy matrix.

In [180]:
#Saving cuisine type dummy matrix:
df_cuisine_type_dum.to_csv('./data/df_cuisine_type_dum_final.csv')

**Generating dummies dataframe for the meal_type column:**

In [181]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [182]:
# #Taking meal type data out of outer quotes (this was from the json format the data was acquired from):
# #Initializing the meal_type_list:
# meal_type_list = []

# #Looping through each recipe's meal type, and removing outer strings using eval and appending the meal type to 
# #the meal_type_list where needed and otherwise adding the meal type to meal_type_list:
# for meal_type in df_recs['meal_type']:
#     try:
#         meal_type_list.append(eval(meal_type))
#     except:
#         meal_type_list.append(meal_type)

In [183]:
# #Assigning meal_type values to the meal_type_list:
# df_recs['meal_type'] = meal_type_list

In [184]:
# #Looking at columns working on creating dummy dataframes for:
# df_recs[cols_will_dum].head()

In [185]:
#Checking to see if each list in the meal_type column only contains zero or one meal type:
len_meal_list = []
for i in range(len(df_recs['meal_type'])):
    #if type(df_recs['meal_type'].iloc[i]) == list:
    len_meal_list.append(len(df_recs['meal_type'].iloc[i]))
    #else:
    #    pass

In [186]:
#Looking at the first five meal_type list lengths:
len_meal_list[0:5]

[1, 1, 2, 1, 1]

In [189]:
#Looking at the minimum meal_type list length:
min(len_meal_list)

1

In [190]:
#Looking at the maximum meal_type list length:
max(len_meal_list)

2

In [191]:
#Looking at meal_type data that is embedded in a list:
df_recs['meal_type'].iloc[0]

['cocktail']

In [192]:
#Unpacking an item embedded in a list:
df_recs['meal_type'].iloc[0][0]

'cocktail'

In [193]:
#Creating a list of all meal_types that are embedded in lists:
meal_types_list_total = []
for meal_type in df_recs['meal_type']:
    meal_types_list_total.extend(meal_type)

In [194]:
#Looking at how many meal types are in meal_types_list_total:
len(meal_types_list_total)

31037

In [195]:
#Inspecting the first meal types in meal_types_list_total:
meal_types_list_total[0:10]

['cocktail',
 'pasta',
 'cocktail',
 'drink',
 'pasta',
 'noodle',
 'cocktail',
 'pasta',
 'side dish',
 'cocktail']

In [198]:
#Generating a list of unique meal types:
unique_meal_types = list(np.unique(meal_types_list_total))
unique_meal_types 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

[' snack',
 ' stew',
 ' sweet',
 'Bread',
 'Casseroles',
 'Categories: Appetizers Fruit Nuts',
 'Categories: Dessert Snack Melons',
 'Categories: Frozen desserts Dessert Eggs/dairy',
 'Categories: Frozen desserts Dessert Melons',
 'Categories: Frozen desserts Ice cream Dessert',
 'Categories: Main dish Side dishes Spinach',
 'Categories: Side dishes Spinach Greens',
 'Categories: Yeast breads Breads Italian',
 'Main Dishes',
 'antipasto',
 'barbecue',
 'bean',
 'biscuit',
 'bread',
 'burger',
 'cake',
 'candy',
 'casserole',
 'cocktail',
 'condiment',
 'cookie',
 'cupcake',
 'custard',
 'dip',
 'dressing',
 'drink',
 'food gift',
 'grain',
 'gratin',
 'ice cream',
 'legume',
 'marinade',
 'muffin',
 'nonalcoholic drink',
 'noodle',
 'omlete',
 'packed lunch',
 'pancake',
 'pancakes',
 'pasta',
 'picnic',
 'pie',
 'pizza',
 'preserve',
 'pudding',
 'roast',
 'roll',
 'rub',
 'salad',
 'sandwich',
 'sauce',
 'side',
 'side dish',
 'side salad',
 'snack',
 'sorbet',
 'soup',
 'spread',
 '

In [197]:
len(unique_meal_types)

70

The meal type is pretty messy.  The meal_type column will therefore not be used in future analysis.  It looks like I didn't read in the meal type data very well using queries with Edamam's API.  It is also possible that Edamam's API doesn't have optimized meal_type values since each site's recipes come from have different methods of formatting their recipe descriptions including meal type definitions for the recipes.  In the future, it would be helpful to figure out how to utilize the meal type data.

**Generating dummies dataframe for the dish_type column:**

In [199]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],[dinner],"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],[dinner],"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",[dinner],"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],[dinner],"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],[dinner],"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [200]:
#Proving that each list in the cuisine_type only contains zero or one cuisine type:
len_dish_type_list = []
for i in range(len(df_recs['dish_type'])):
    len_dish_type_list.append(len(df_recs['dish_type'].iloc[i]))

In [201]:
#Looking at the minimum dish_type list length:
min(len_dish_type_list)

1

In [202]:
#Looking at the maximum dish_type list length:
max(len_dish_type_list)

1

There is only one dish type per row, so the pd.get_dummies fuction is appropriate to use in this situation.  First, the dish types have to be removed from their lists.

In [203]:
#Removing dish_type values from their lists (only one dish_type in each list, so taking out first element 
#from each dish_type list):
df_recs['dish_type'] = [dish_type[0] for dish_type in df_recs['dish_type']]

In [204]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


The dish types were successfully removed from their lists.

In [205]:
#Creating a dummy matrix of dish types: 
#Dropping the first column because it can be gathered that if the dish type is not any of the other dish types,
#it is the dish type that is dropped.
df_dish_type_dum = pd.get_dummies(df_recs['dish_type'], 
                               prefix = 'dish_type', #adding a prefix to column names to show from source column
                               prefix_sep = '_', #adding prefix separator
                               drop_first = True, #dropping first column to avoid dummy dataframe redundancy
                               dummy_na=True) #making sure a column is formed to show if a source is a null value

#Checking work:
df_dish_type_dum.head()

#Sources: 
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
#https://datascience.stackexchange.com/questions/28353/always-drop-the-first-column-after-performing-one-hot-encoding

Unnamed: 0,dish_type_breakfast,dish_type_dessert,dish_type_dinner,dish_type_lunch,dish_type_nibble,dish_type_nan
0,0,0,1,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0


In [206]:
df_dish_type_dum.shape

(22941, 6)

The dish type categorical data was successfully placed in a dummy matrix. It's interesting that there are a few breakfast dish types.  I thought there would only be one breakfast dish type.  It might be interesting to see whether bundling dish types would improve the recommender.

In [207]:
#Saving dish type dummy matrix:
df_dish_type_dum.to_csv('./data/df_dish_type_dum_final.csv')

**Generating dummies dataframe for the ingredients column:**

In [208]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [209]:
#Looking at ingredient data that is embedded in a list:
df_recs['ingredients'].iloc[0]

['gin', 'dry vermouth', 'orange bitters']

In [210]:
#Unpacking an item embedded in a list:
df_recs['ingredients'].iloc[0][0]

'gin'

In [211]:
#Creating a list of all ingredients that are embedded in lists:
ingredients_list_total = []
for ingredient in df_recs['ingredients']:
    ingredients_list_total.extend(ingredient)

In [212]:
#Looking at how many ingredients are in ingredients_list_total:
len(ingredients_list_total)

216506

In [213]:
#Inspecting the first 10 ingredients in ingredients_list_total:
ingredients_list_total[0:10]

['gin',
 'dry vermouth',
 'orange bitters',
 'flour',
 'egg yolks',
 'olive oil',
 'sweet vermouth',
 'gin',
 'campari',
 'eggs']

In [214]:
#Generating a list of unique ingredients:
unique_ingredients = list(np.unique(ingredients_list_total))
unique_ingredients 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

['1 % milk',
 '1 percent low-fat milk',
 '1 percent lowfat milk',
 '1 percent milk',
 '1% cottage cheese',
 '1% lowfat milk',
 '1% milk',
 '1%% Milk',
 '1%% milk',
 '1-percent milk',
 '10x sugar',
 '2 percent milk',
 '2 percent reduced-fat milk',
 '2% Greek yogurt',
 '2% cottage cheese',
 '2% milk',
 '2%% milk',
 '2-percent milk',
 '7-up',
 '85-percent lean ground beef',
 '90 percent lean ground beef',
 '90% lean ground beef',
 '93% -lean ground turkey',
 '93% lean ground turkey',
 '95% lean ground beef',
 'ALL PURPOSE FLOUR',
 'ALL-PURPOSE FLOUR',
 'ATHENOS Traditional Crumbled Feta Cheese',
 'Acini Di Pepe',
 'Acini di pepe',
 'Acorn Squash',
 'Alaskan pollock',
 'Alfredo Pasta Sauce',
 'Alfredo Sauce',
 'Alfredo pasta sauce',
 'Alfredo sauce',
 'All Purpose Flour',
 'All purpose flour',
 'All spices',
 'All-Purpose Flour',
 'All-purpose Flour',
 'All-purpose flour',
 'Almond',
 'Almond extract',
 'Almonds',
 'Amaretti Cookies',
 'Amaretto',
 'American Cheese',
 'American cheese',
 '

In [215]:
#Finding the number of unique ingredients:
len(unique_ingredients)

5417

The ingredients have a lot of variability in title of ingredients.  I think it makes sense to try using ingredient categories to see if the recommender will perform well with just ingredient categories.  If the recommender does not perform well, I'll consider preprocessing (making the text lower case and removing symbols and potentially non-number) the ingredients column data and then using that data instead of the ingredient category data.  

**Generating dummies dataframe for the ingredient categories column:**

In [372]:
#Looking at columns working on creating dummy dataframes for:
df_recs[cols_will_dum].head()

Unnamed: 0,source,diet_labels,health_labels,cautions,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories
0,Serious Eats,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc..."
1,Martha Stewart,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]"
2,Saveur,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc..."
3,Food52,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]"
4,Epicurious,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ..."


In [373]:
#Looking at ingredient category data that is embedded in a list:
df_recs['ingredient_categories'].iloc[0]

['liquors and cocktails', 'wines', 'liquors and cocktails']

In [374]:
#Unpacking an item embedded in a list:
df_recs['ingredient_categories'].iloc[0][0]

'liquors and cocktails'

In [375]:
#Creating a list of all ingredient categories that are embedded in lists:
ingredient_categories_list_total = []
for ingredient_cat in df_recs['ingredient_categories']:
    ingredient_categories_list_total.extend(ingredient_cat)

In [376]:
#Looking at how many ingredients are in ingredients_list_total:
len(ingredient_categories_list_total)

216506

In [377]:
#Inspecting the first 10 ingredient categories in ingredient_categories_list_total:
ingredient_categories_list_total[0:10]

['liquors and cocktails',
 'wines',
 'liquors and cocktails',
 'grains',
 'Eggs',
 'Oils',
 'wines',
 'liquors and cocktails',
 'liquors and cocktails',
 'Eggs']

In [378]:
#Creating a dataframe df_ingr_cats and filling with the ingredient_categories_list_total:
#(doing this because it's easier to do the next manipulation using a dataframe)
df_ingr_cats = pd.DataFrame()
df_ingr_cats['ingredient_categories'] = ingredient_categories_list_total

In [379]:
#Filling None type values with NA values:
for i in range(len(df_ingr_cats['ingredient_categories'])):
    if df_ingr_cats['ingredient_categories'].iloc[i] == None:
        df_ingr_cats['ingredient_categories'].iloc[i] = 'NA'

In [380]:
#Filling None type values with empty strings:
for i in range(len(df_ingr_cats['ingredient_categories'])):
    if type(df_ingr_cats['ingredient_categories'].iloc[i]) != str:
        print(df_ingr_cats['ingredient_categories'].iloc[i]) 

In [381]:
#Finding the index of rows where empty strings are present:
index_no_ingr_cats = df_ingr_cats[df_ingr_cats['ingredient_categories'] == 'NA'].index
index_no_ingr_cats

Int64Index([115805], dtype='int64')

In [382]:
#Removing rows where there are empty strings:
df_ingr_cats.drop(index = index_no_ingr_cats, inplace = True)

In [383]:
#Checking all values in df_ingr_cats['ingredient_categories'] are string type data:
for i in df_ingr_cats['ingredient_categories']:
    if type(i) != str:
        print(i)

In [384]:
#Checking all empty string values in df_ingr_cats['ingredient_categories'] have been removed:
for i in df_ingr_cats['ingredient_categories']:
    if i == 'NA':
        print(i)

In [385]:
#Placing df_ingr_cats['ingredient_categories'] into a list:
ingr_cats_list = list(df_ingr_cats['ingredient_categories'])

In [386]:
#Checking ingr_cats_list is a list:
type(ingr_cats_list)

list

In [387]:
#Generating a list of unique ingredients:
unique_ingredient_categories = list(np.unique(ingr_cats_list))
unique_ingredient_categories 

#Sources:
#https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
#https://www.geeksforgeeks.org/python-get-unique-values-list/
#https://stackoverflow.com/questions/29459008/how-to-remove-brackets-from-python-string
#https://stackoverflow.com/questions/12897374/get-unique-values-from-a-list-in-python

['0.0',
 '100% fruit juice',
 '100% juice',
 'Baby food',
 'Cheese',
 'Condiments and sauces',
 'Cured meats',
 'Dairy',
 'Eggs',
 'Frozen poultry',
 'Mexican',
 'Milk',
 'Oils',
 'Plant-based protein',
 'Poultry',
 'Sugars',
 'Vegan products',
 'beer',
 'bread, rolls and tortillas',
 'candy',
 'canned fruit',
 'canned grains',
 'canned meats',
 'canned poultry',
 'canned seafood',
 'canned soup',
 'canned vegetables',
 'cheese',
 'chocolate',
 'cocktails and liquors',
 'coffee and tea',
 'condiments and sauces',
 'cooked grains',
 'crackers',
 'cured meats',
 'dairy',
 'dried fruit and nuts',
 'eggs',
 'flavored water',
 'frozen grained based',
 'frozen grains',
 'frozen poultry',
 'frozen treats',
 'fruit',
 'grains',
 'liquors and cocktails',
 'meats',
 'milk',
 'mixed grains',
 'mixed seafood',
 'mixed soup',
 'non-dairy beverages',
 'oils',
 'pastries',
 'pizza',
 'plant-based protein',
 'poultry',
 'protein and nutritional powders',
 'quick breads and pastries',
 'ready-to-eat ce

In [388]:
#Creating ingredient category dummie column names which will become headings for the ingr_cats dummy dataframe:
ingr_cats_dummies_cols = []
for ingr_cat in unique_ingredient_categories:
    ingr_cats_dummies_cols.append(f'ingredient_category_{ingr_cat}') 
ingr_cats_dummies_cols

['ingredient_category_0.0',
 'ingredient_category_100% fruit juice',
 'ingredient_category_100% juice',
 'ingredient_category_Baby food',
 'ingredient_category_Cheese',
 'ingredient_category_Condiments and sauces',
 'ingredient_category_Cured meats',
 'ingredient_category_Dairy',
 'ingredient_category_Eggs',
 'ingredient_category_Frozen poultry',
 'ingredient_category_Mexican',
 'ingredient_category_Milk',
 'ingredient_category_Oils',
 'ingredient_category_Plant-based protein',
 'ingredient_category_Poultry',
 'ingredient_category_Sugars',
 'ingredient_category_Vegan products',
 'ingredient_category_beer',
 'ingredient_category_bread, rolls and tortillas',
 'ingredient_category_candy',
 'ingredient_category_canned fruit',
 'ingredient_category_canned grains',
 'ingredient_category_canned meats',
 'ingredient_category_canned poultry',
 'ingredient_category_canned seafood',
 'ingredient_category_canned soup',
 'ingredient_category_canned vegetables',
 'ingredient_category_cheese',
 'ingr

In [389]:
#Finding the number of unique ingredient categories:
len(ingr_cats_dummies_cols)

73

There are 73 unique ingredient categories.  Some should probably be combined.  It'll do that once I have an ingredient categories dummy dataframe.

In [390]:
#Instantiating the ingredient categories dummy dataframe:
df_ingr_cats_dum = pd.DataFrame(columns=ingr_cats_dummies_cols)
df_ingr_cats_dum

Unnamed: 0,ingredient_category_0.0,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Cheese,ingredient_category_Condiments and sauces,ingredient_category_Cured meats,ingredient_category_Dairy,ingredient_category_Eggs,ingredient_category_Frozen poultry,ingredient_category_Mexican,ingredient_category_Milk,ingredient_category_Oils,ingredient_category_Plant-based protein,ingredient_category_Poultry,ingredient_category_Sugars,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_cocktails and liquors,ingredient_category_coffee and tea,ingredient_category_condiments and sauces,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_cured meats,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen poultry,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_liquors and cocktails,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_plant-based protein,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar and syrups,ingredient_category_sugar jam,ingredient_category_sugar syrups,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt


In [391]:
df_recs['ingredient_categories'].iloc[0]

['liquors and cocktails', 'wines', 'liquors and cocktails']

In [392]:
#Adding each row's embedded dummy labels to df_ingr_cats_dum 
#(0 if not present in the embedded ingredient categories list, 
#1 if present in the embedded ingredient categories list):
for i in range(len(df_recs['ingredient_categories'])):
    #Instantiating a list of values for the row:
    row = []
    #Looping through each unique ingredient category value, and placing a 1 if the ingredient category embedded 
    #in that row of df_recs['ingredient_categories'] and 0 if not: 
    for ingr_category in unique_ingredient_categories: 
        if ingr_category in df_recs['ingredient_categories'].iloc[i]:
            row.append(1)
        else:
            row.append(0)
    #Adding the row of dummy values to df_cautions_dum at the corresponding location:
    df_ingr_cats_dum.loc[i] = row

In [393]:
#Looking at first five rows of df_recs['ingredient_categories']:
df_recs['ingredient_categories'].head()

0    [liquors and cocktails, wines, liquors and coc...
1                                 [grains, Eggs, Oils]
2    [wines, liquors and cocktails, liquors and coc...
3                [Eggs, grains, Condiments and sauces]
4    [grains, grains, Condiments and sauces, Eggs, ...
Name: ingredient_categories, dtype: object

In [394]:
#Checking the first five rows of df_ingr_cats_dum match with data in the first five rows of 
#df_recs['ingredient_categories']:
df_ingr_cats_dum.head()

Unnamed: 0,ingredient_category_0.0,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Cheese,ingredient_category_Condiments and sauces,ingredient_category_Cured meats,ingredient_category_Dairy,ingredient_category_Eggs,ingredient_category_Frozen poultry,ingredient_category_Mexican,ingredient_category_Milk,ingredient_category_Oils,ingredient_category_Plant-based protein,ingredient_category_Poultry,ingredient_category_Sugars,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_cocktails and liquors,ingredient_category_coffee and tea,ingredient_category_condiments and sauces,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_cured meats,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen poultry,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_liquors and cocktails,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_plant-based protein,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar and syrups,ingredient_category_sugar jam,ingredient_category_sugar syrups,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [395]:
df_ingr_cats_dum.shape

(22941, 73)

The first five rows in df_ingr_cats_dum correspond with the first five rows in df_recs['ingredient_categories'].  The dummy dataframe for ingredient categories appears to have been created properly.  

In [396]:
#Saving ingredient category dummy matrix:
df_ingr_cats_dum.to_csv('./data/df_ingr_cats_dum_final.csv')  

#### Combining df_recs with dummy matrices which were created in this notebook:

In [292]:
df_recs.head()

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924


In [293]:
df_recs.shape

(22941, 45)

In [294]:
df_recs.index

RangeIndex(start=0, stop=22941, step=1)

In [295]:
df_recs.reset_index(drop=True, inplace=True)
#Roy Kim helped with this on 15 May 2019

In [296]:
df_recs.index

RangeIndex(start=0, stop=22941, step=1)

In [297]:
df_source_dum.head()

Unnamed: 0,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [298]:
df_source_dum.shape

(22941, 20)

In [299]:
df_source_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [301]:
df_source_dum.reset_index(drop=True, inplace=True)
df_source_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [302]:
#Merging dataframes by index:
df_recs_src = df_recs.merge(df_source_dum, left_index = True, right_index = True)

#Checking work:
df_recs_src.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
#https://stackoverflow.com/questions/40468069/merge-two-dataframes-by-index/40468090

Unnamed: 0,title,source,url,yield,diet_labels,health_labels,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan
0,Martini Recipe,Serious Eats,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,Pasta Dough,Martha Stewart,http://www.marthastewart.com/337857/pasta-dough,6.0,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,Saveur,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Simple Fresh Pasta,Food52,http://food52.com/recipes/27825-simple-fresh-p...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Egg Noodle,Epicurious,http://www.epicurious.com/recipes/food/views/E...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [303]:
#Making sure have 45 + 20 = 65 columns
df_recs_src.shape

(22941, 65)

In [304]:
#Removing source column since now have dummy columns in the dataframe:
df_recs_src.drop(labels='source',
                 axis = 1,
                 inplace = True)

In [305]:
#Making sure have 65 - 1 = 64 columns
df_recs_src.shape

(22941, 64)

In [306]:
df_recs_src.index

RangeIndex(start=0, stop=22941, step=1)

In [307]:
df_diet_labels_dum.head()

Unnamed: 0,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium
0,0,0,0,0,1,1
1,1,0,0,0,0,1
2,0,0,0,0,1,1
3,0,0,0,0,1,0
4,0,0,0,0,1,0


In [308]:
df_diet_labels_dum.shape

(22941, 6)

In [310]:
df_diet_labels_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [309]:
df_diet_labels_dum.reset_index(drop=True, inplace=True)
df_diet_labels_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [311]:
#Merging dataframes by index:
df_recs_src_dt = df_recs_src.merge(df_diet_labels_dum, left_index = True, right_index = True)

#Checking work:
df_recs_src_dt.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
#https://stackoverflow.com/questions/40468069/merge-two-dataframes-by-index/40468090

Unnamed: 0,title,url,yield,diet_labels,health_labels,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,"[Balanced, Low-Sodium]","[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"[Low-Fat, Low-Sodium]","[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,[Low-Fat],"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [312]:
#Making sure have 64 + 6 = 70 columns:
df_recs_src_dt.shape

(22941, 70)

In [313]:
#Removing diet_labels column since now have dummy columns in the dataframe:
df_recs_src_dt.drop(labels='diet_labels',
                    axis = 1,
                    inplace = True)

In [314]:
#Making sure have 70 - 1 = 69 columns:
df_recs_src_dt.shape

(22941, 69)

In [315]:
df_health_labels_dum.head()

Unnamed: 0,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free
0,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0
2,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
3,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0
4,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0


In [316]:
df_health_labels_dum.shape

(22941, 30)

In [317]:
df_health_labels_dum.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            22931, 22932, 22933, 22934, 22935, 22936, 22937, 22938, 22939,
            22940],
           dtype='int64', length=22941)

In [318]:
df_health_labels_dum.reset_index(drop=True, inplace=True)
df_health_labels_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [319]:
df_recs_src_dt.index

RangeIndex(start=0, stop=22941, step=1)

In [320]:
df_recs_src_dt_hlt = df_recs_src_dt.merge(df_health_labels_dum, left_index = True, right_index = True) 

#Checking work:
df_recs_src_dt_hlt.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Unnamed: 0,title,url,yield,health_labels,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,"[Sugar-Conscious, Low Potassium, Vegetarian, P...",[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,"[Sugar-Conscious, Low Potassium, Kidney-Friend...",[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0


In [321]:
#Checking have 69 + 30 = 99 columns
df_recs_src_dt_hlt.shape

(22941, 99)

In [322]:
#Removing health_labels column since now have dummy columns in the dataframe:
df_recs_src_dt_hlt.drop(labels='health_labels',
                        axis = 1,
                        inplace = True)

In [324]:
#Checking have 99 - 1 = 98 columns
df_recs_src_dt_hlt.shape

(22941, 98)

In [325]:
df_cautions_dum.head()

Unnamed: 0,cautions_Eggs,cautions_FODMAP,cautions_Gluten,cautions_Milk,cautions_Peanuts,cautions_Shellfish,cautions_Soy,cautions_Sulfites,cautions_Tree-Nuts,cautions_Wheat
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


In [326]:
df_health_labels_dum.columns

Index(['health_labels_Alcohol-Cocktail', 'health_labels_Alcohol-Free',
       'health_labels_Celery-Free', 'health_labels_Crustacean-Free',
       'health_labels_Dairy-Free', 'health_labels_Egg-Free',
       'health_labels_Fish-Free', 'health_labels_Gluten-Free',
       'health_labels_Keto-Friendly', 'health_labels_Kidney-Friendly',
       'health_labels_Kosher', 'health_labels_Low Potassium',
       'health_labels_Low Sugar', 'health_labels_Lupine-Free',
       'health_labels_Mollusk-Free', 'health_labels_Mustard-Free',
       'health_labels_No oil added', 'health_labels_Paleo',
       'health_labels_Peanut-Free', 'health_labels_Pescatarian',
       'health_labels_Pork-Free', 'health_labels_Red-Meat-Free',
       'health_labels_Sesame-Free', 'health_labels_Shellfish-Free',
       'health_labels_Soy-Free', 'health_labels_Sugar-Conscious',
       'health_labels_Tree-Nut-Free', 'health_labels_Vegan',
       'health_labels_Vegetarian', 'health_labels_Wheat-Free'],
      dtype='object')

In [327]:
df_cautions_dum.columns

Index(['cautions_Eggs', 'cautions_FODMAP', 'cautions_Gluten', 'cautions_Milk',
       'cautions_Peanuts', 'cautions_Shellfish', 'cautions_Soy',
       'cautions_Sulfites', 'cautions_Tree-Nuts', 'cautions_Wheat'],
      dtype='object')

It is redundant to include cautions for eggs because if health_labels_Egg-Free = 1, then cautions_Eggs would have a 0, etc.  This is true for the following cautions:
- cautions_Eggs
- cautions_Gluten
- cautions_Milk
- cautions_Peanuts
- cautions_Shellfish
- cautions_Soy
- cautions_Tree-Nuts
- cautions_Wheat

The cautions that will still be used are:
- cautions_FODMAP
- cautions_Sulfites

In [328]:
df_recs_src_dt_hlt.head()

Unnamed: 0,title,url,yield,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0


In [329]:
df_cautions_dum_use = df_cautions_dum[['cautions_FODMAP', 'cautions_Sulfites']]
df_cautions_dum_use.head()

Unnamed: 0,cautions_FODMAP,cautions_Sulfites
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [330]:
df_cautions_dum_use.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            22931, 22932, 22933, 22934, 22935, 22936, 22937, 22938, 22939,
            22940],
           dtype='int64', length=22941)

In [331]:
df_cautions_dum_use.reset_index(drop=True, inplace=True)
df_cautions_dum_use.index

RangeIndex(start=0, stop=22941, step=1)

In [332]:
df_recs_src_dt_hlt.index

RangeIndex(start=0, stop=22941, step=1)

In [333]:
df_recs_src_dt_hlt_ctns = df_recs_src_dt_hlt.merge(df_cautions_dum_use, left_index = True, right_index = True) 

#Checking work:
df_recs_src_dt_hlt_ctns.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Unnamed: 0,title,url,yield,cautions,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free,cautions_FODMAP,cautions_Sulfites
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,[],85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,[],603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,[],85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,[],452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,[],559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0


In [334]:
#Checking have 98 + 2 = 100 columns
df_recs_src_dt_hlt_ctns.shape

(22941, 100)

In [335]:
#Removing cautions column since now have dummy columns in the dataframe:
df_recs_src_dt_hlt_ctns.drop(labels='cautions',
                             axis = 1,
                             inplace = True)

In [336]:
#Checking have 100 - 1 = 99 columns
df_recs_src_dt_hlt_ctns.shape

(22941, 99)

In [337]:
df_cuisine_type_dum.head()

Unnamed: 0,cuisine_type_.,cuisine_type_4 Points,cuisine_type_African,cuisine_type_Albanian,cuisine_type_Amerian,cuisine_type_American,"cuisine_type_American, Barbecue, Southern","cuisine_type_American, French","cuisine_type_American, Italian","cuisine_type_American, Southern",cuisine_type_American-South,cuisine_type_American|Asian,cuisine_type_American|North American|Cajun/Creole,cuisine_type_Amish,cuisine_type_Andhra,cuisine_type_Appetizer,cuisine_type_Arab,cuisine_type_Argentinian,cuisine_type_Armenian,cuisine_type_Asian,"cuisine_type_Asian, Chinese",cuisine_type_Aussie,cuisine_type_Australian,cuisine_type_Australian/New Zealand,cuisine_type_Austrian,cuisine_type_Baking,cuisine_type_Balochi,cuisine_type_Bangladeshi,cuisine_type_Belarusian,cuisine_type_Belgian,cuisine_type_Belgium,cuisine_type_Brazilian,cuisine_type_Breakfast,cuisine_type_British,cuisine_type_BritishEurope,cuisine_type_Bulgarian,cuisine_type_Cajun,cuisine_type_Cajun/Creole|Southern,cuisine_type_Canadian,cuisine_type_Caribbean,cuisine_type_Central American/Caribbean,cuisine_type_Cheap,cuisine_type_Chechen,cuisine_type_Chinese,cuisine_type_Colombian,cuisine_type_Cookie,cuisine_type_Costa Rican,cuisine_type_Creole,cuisine_type_Cuban,cuisine_type_Cuban|Central American/Caribbean,...,cuisine_type_Portuguese,cuisine_type_Puerto Rican,cuisine_type_Raisine,cuisine_type_Romanian,cuisine_type_Russian,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [338]:
df_cuisine_type_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [339]:
df_cuisine_type_dum.reset_index(drop=True, inplace=True)
df_cuisine_type_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [340]:
df_recs_src_dt_hlt_ctns.index

RangeIndex(start=0, stop=22941, step=1)

In [341]:
df_recs_src_dt_hlt_ctns_ct = df_recs_src_dt_hlt_ctns.merge(df_cuisine_type_dum, 
                                                           left_index = True, right_index = True) 

#Checking work:
df_recs_src_dt_hlt_ctns_ct.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Unnamed: 0,title,url,yield,total_weight,total_time,cuisine_type,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,...,cuisine_type_Portuguese,cuisine_type_Puerto Rican,cuisine_type_Raisine,cuisine_type_Romanian,cuisine_type_Russian,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,Italian,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,Italian,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,American,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,Italian,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,Italian,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


If I had more time, I would have cleaned up the cuisine type columns since some of them overlap.

In [342]:
#Checking have 99 + 179 = 278 columns
df_recs_src_dt_hlt_ctns_ct.shape

(22941, 278)

In [343]:
#Removing cuisine type column since now have dummy columns in the dataframe:
df_recs_src_dt_hlt_ctns_ct.drop(labels='cuisine_type',
                                axis = 1,
                                inplace = True)

In [344]:
#Checking have 278 - 1 = 277 columns
df_recs_src_dt_hlt_ctns_ct.shape

(22941, 277)

In [345]:
df_dish_type_dum.head()

Unnamed: 0,dish_type_breakfast,dish_type_dessert,dish_type_dinner,dish_type_lunch,dish_type_nibble,dish_type_nan
0,0,0,1,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0


In [346]:
df_dish_type_dum.shape

(22941, 6)

In [347]:
df_dish_type_dum.columns

Index(['dish_type_breakfast', 'dish_type_dessert', 'dish_type_dinner',
       'dish_type_lunch', 'dish_type_nibble', 'dish_type_nan'],
      dtype='object')

In [348]:
#breakfast_list = df_dish_type_dum['dish_type_Breakfast Casseroles'] + df_dish_type_dum['dish_type_Fruit Breakfast'] + df_dish_type_dum['dish_type_Meat Breakfast'] + df_dish_type_dum['dish_type_Other Breakfast'] + df_dish_type_dum['dish_type_breakfast']


In [762]:
#df_dish_type_dum['dish_type_breakfast'] = breakfast_list

In [763]:
# df_dish_type_dum.drop(labels=['dish_type_Breakfast Casseroles', 
#                               'dish_type_Fruit Breakfast',
#                               'dish_type_Meat Breakfast', 
#                               'dish_type_Other Breakfast'],
#                      axis = 1,
#                      inplace = True)

In [349]:
df_dish_type_dum.columns

Index(['dish_type_breakfast', 'dish_type_dessert', 'dish_type_dinner',
       'dish_type_lunch', 'dish_type_nibble', 'dish_type_nan'],
      dtype='object')

In [350]:
#Removing dish_type_nan because we can assume that if a dish has no other dish type, it is dish_type_nan:
df_dish_type_dum.drop(labels='dish_type_nan',
                      axis = 1,
                      inplace = True)

In [351]:
df_dish_type_dum.columns

Index(['dish_type_breakfast', 'dish_type_dessert', 'dish_type_dinner',
       'dish_type_lunch', 'dish_type_nibble'],
      dtype='object')

In [352]:
df_dish_type_dum.shape

(22941, 5)

In [353]:
df_dish_type_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [354]:
df_dish_type_dum.reset_index(drop=True, inplace=True)
df_dish_type_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [355]:
df_recs_src_dt_hlt_ctns_ct_typ = df_recs_src_dt_hlt_ctns_ct.merge(df_dish_type_dum, 
                                                           left_index = True, right_index = True) 

#Checking work:
df_recs_src_dt_hlt_ctns_ct_typ.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Unnamed: 0,title,url,yield,total_weight,total_time,meal_type,dish_type,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,...,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan,dish_type_breakfast,dish_type_dessert,dish_type_dinner,dish_type_lunch,dish_type_nibble
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,[cocktail],dinner,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,[pasta],dinner,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,"[cocktail, drink]",dinner,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,[pasta],dinner,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,[noodle],dinner,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [356]:
#Making sure 277 + 5 = 282 columns:
df_recs_src_dt_hlt_ctns_ct_typ.shape

(22941, 282)

In [357]:
#Removing dish type column since now have dummy columns in the dataframe:
df_recs_src_dt_hlt_ctns_ct_typ.drop(labels='dish_type',
                                    axis = 1,
                                    inplace = True)

In [358]:
#Removing meal type column because didn't find the data clean enough to use yet:
df_recs_src_dt_hlt_ctns_ct_typ.drop(labels='meal_type',
                                    axis = 1,
                                    inplace = True)

In [397]:
#Making sure 282 - 2 = 280 columns:
df_recs_src_dt_hlt_ctns_ct_typ.shape

(22941, 280)

In [398]:
df_ingr_cats_dum.head()

Unnamed: 0,ingredient_category_0.0,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Cheese,ingredient_category_Condiments and sauces,ingredient_category_Cured meats,ingredient_category_Dairy,ingredient_category_Eggs,ingredient_category_Frozen poultry,ingredient_category_Mexican,ingredient_category_Milk,ingredient_category_Oils,ingredient_category_Plant-based protein,ingredient_category_Poultry,ingredient_category_Sugars,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_cocktails and liquors,ingredient_category_coffee and tea,ingredient_category_condiments and sauces,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_cured meats,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen poultry,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_liquors and cocktails,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_plant-based protein,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar and syrups,ingredient_category_sugar jam,ingredient_category_sugar syrups,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [399]:
df_ingr_cats_dum.columns

Index(['ingredient_category_0.0', 'ingredient_category_100% fruit juice',
       'ingredient_category_100% juice', 'ingredient_category_Baby food',
       'ingredient_category_Cheese',
       'ingredient_category_Condiments and sauces',
       'ingredient_category_Cured meats', 'ingredient_category_Dairy',
       'ingredient_category_Eggs', 'ingredient_category_Frozen poultry',
       'ingredient_category_Mexican', 'ingredient_category_Milk',
       'ingredient_category_Oils', 'ingredient_category_Plant-based protein',
       'ingredient_category_Poultry', 'ingredient_category_Sugars',
       'ingredient_category_Vegan products', 'ingredient_category_beer',
       'ingredient_category_bread, rolls and tortillas',
       'ingredient_category_candy', 'ingredient_category_canned fruit',
       'ingredient_category_canned grains', 'ingredient_category_canned meats',
       'ingredient_category_canned poultry',
       'ingredient_category_canned seafood', 'ingredient_category_canned soup',


In [400]:
#Removing ingredient_category_0.0 column from df_ingr_cats_dum because it doesn't make sense,
#and if no other ingredients are present, perhaps the only ingredient category is category 0.0:
df_ingr_cats_dum.drop(labels='ingredient_category_0.0',
                      axis = 1,
                      inplace = True)

In [401]:
df_ingr_cats_dum.shape

(22941, 72)

In [402]:
#Combining overlapping columns and removing redundant columns:
df_ingr_cats_dum['ingredient_category_cheese'] = df_ingr_cats_dum['ingredient_category_Cheese'] + df_ingr_cats_dum['ingredient_category_cheese'] 
df_ingr_cats_dum['ingredient_category_condiments_and_sauces'] = df_ingr_cats_dum['ingredient_category_Condiments and sauces'] + df_ingr_cats_dum['ingredient_category_condiments and sauces']
df_ingr_cats_dum['ingredient_category_cured_meats'] = df_ingr_cats_dum['ingredient_category_Cured meats'] + df_ingr_cats_dum['ingredient_category_cured meats']
df_ingr_cats_dum['ingredient_category_dairy'] = df_ingr_cats_dum['ingredient_category_Dairy'] + df_ingr_cats_dum['ingredient_category_dairy']
df_ingr_cats_dum['ingredient_category_eggs'] = df_ingr_cats_dum['ingredient_category_Eggs'] + df_ingr_cats_dum['ingredient_category_eggs']
df_ingr_cats_dum['ingredient_category_frozen_poultry'] = df_ingr_cats_dum['ingredient_category_Frozen poultry'] + df_ingr_cats_dum['ingredient_category_frozen poultry']
df_ingr_cats_dum['ingredient_category_milk'] = df_ingr_cats_dum['ingredient_category_Milk'] + df_ingr_cats_dum['ingredient_category_milk']
df_ingr_cats_dum['ingredient_category_oils'] = df_ingr_cats_dum['ingredient_category_Oils'] + df_ingr_cats_dum['ingredient_category_oils']
df_ingr_cats_dum['ingredient_category_plant_based_protein'] = df_ingr_cats_dum['ingredient_category_Plant-based protein'] + df_ingr_cats_dum['ingredient_category_plant-based protein']
df_ingr_cats_dum['ingredient_category_poultry'] = df_ingr_cats_dum['ingredient_category_Poultry'] + df_ingr_cats_dum['ingredient_category_poultry']
df_ingr_cats_dum['ingredient_category_sugars'] = df_ingr_cats_dum['ingredient_category_Sugars'] + df_ingr_cats_dum['ingredient_category_sugars']
df_ingr_cats_dum['ingredient_category_liquors_and_cocktails'] = df_ingr_cats_dum['ingredient_category_cocktails and liquors'] + df_ingr_cats_dum['ingredient_category_liquors and cocktails']
df_ingr_cats_dum['ingredient_category_sugar_and_syrups'] = df_ingr_cats_dum['ingredient_category_sugar syrups'] + df_ingr_cats_dum['ingredient_category_sugar and syrups']


df_ingr_cats_dum.drop(labels = ['ingredient_category_Cheese',
                                'ingredient_category_Condiments and sauces',
                                'ingredient_category_condiments and sauces',
                                'ingredient_category_Cured meats',
                                'ingredient_category_cured meats',
                                'ingredient_category_Dairy',
                                'ingredient_category_Eggs',
                                'ingredient_category_Frozen poultry',
                                'ingredient_category_frozen poultry',
                                'ingredient_category_Milk',
                                'ingredient_category_Oils',
                                'ingredient_category_Plant-based protein',
                                'ingredient_category_plant-based protein',
                                'ingredient_category_Poultry',
                                'ingredient_category_Sugars',
                                'ingredient_category_cocktails and liquors',
                                'ingredient_category_liquors and cocktails',
                                'ingredient_category_sugar and syrups',
                                'ingredient_category_sugar syrups'],
                      axis = 1,
                      inplace = True)

In [403]:
df_ingr_cats_dum.head()

Unnamed: 0,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Mexican,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [404]:
#Checking have 72 - 13 = 59 columns:
df_ingr_cats_dum.shape

(22941, 59)

In [405]:
df_ingr_cats_dum.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            22931, 22932, 22933, 22934, 22935, 22936, 22937, 22938, 22939,
            22940],
           dtype='int64', length=22941)

In [406]:
df_ingr_cats_dum.reset_index(drop=True, inplace=True)
df_ingr_cats_dum.index

RangeIndex(start=0, stop=22941, step=1)

In [407]:
df_recs_src_dt_hlt_ctns_ct_typ.shape

(22941, 280)

In [408]:
df_recs_src_dt_hlt_ctns_ct_typ.index

RangeIndex(start=0, stop=22941, step=1)

In [409]:
df_recs_src_dt_hlt_ctns_ct_typ_ingr = df_recs_src_dt_hlt_ctns_ct_typ.merge(df_ingr_cats_dum, 
                                                           left_index = True, right_index = True) 

#Checking work:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.head()

#Source:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Unnamed: 0,title,url,yield,total_weight,total_time,ingredients,ingredient_categories,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,...,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,"[gin, dry vermouth, orange bitters]","[liquors and cocktails, wines, liquors and coc...",87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,"[flour, egg yolks, olive oil]","[grains, Eggs, Oils]",354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,"[sweet vermouth, gin, campari]","[wines, liquors and cocktails, liquors and coc...",81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,"[eggs, flour, salt]","[Eggs, grains, Condiments and sauces]",217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,"[all-purpose flour, semolina flour, salt, eggs...","[grains, grains, Condiments and sauces, Eggs, ...",301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [410]:
#Checking have 280 + 59 = 339 columns:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.shape

(22941, 339)

In [411]:
#Removing ingredients and ingredient categories columns since now have dummy columns
#related to those columns in the dataframe:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.drop(labels=['ingredients', 'ingredient_categories'],
                                         axis = 1,
                                         inplace = True)

In [412]:
#Making sure have 339 - 2 = 337 columns remaining
df_recs_src_dt_hlt_ctns_ct_typ_ingr.shape

(22941, 337)

In [413]:
#Finding the number of columns with each datatype:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.dtypes.value_counts()

#Source:
#https://stackoverflow.com/questions/32337380/count-data-types-in-pandas-dataframe

uint8      204
object      99
float64     34
dtype: int64

In [414]:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.head()

Unnamed: 0,title,url,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,...,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [416]:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.select_dtypes(include = object).head()

Unnamed: 0,title,url,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free,cautions_FODMAP,cautions_Sulfites,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Mexican,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [417]:
#Looking at colemns with object type data:
cols_obj_numerical = df_recs_src_dt_hlt_ctns_ct_typ_ingr.select_dtypes(include = object).columns.drop(
                        labels = ['title', 'url'])
cols_obj_numerical

#Source: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html

Index(['diet_labels_Balanced', 'diet_labels_High-Fiber',
       'diet_labels_High-Protein', 'diet_labels_Low-Carb',
       'diet_labels_Low-Fat', 'diet_labels_Low-Sodium',
       'health_labels_Alcohol-Cocktail', 'health_labels_Alcohol-Free',
       'health_labels_Celery-Free', 'health_labels_Crustacean-Free',
       'health_labels_Dairy-Free', 'health_labels_Egg-Free',
       'health_labels_Fish-Free', 'health_labels_Gluten-Free',
       'health_labels_Keto-Friendly', 'health_labels_Kidney-Friendly',
       'health_labels_Kosher', 'health_labels_Low Potassium',
       'health_labels_Low Sugar', 'health_labels_Lupine-Free',
       'health_labels_Mollusk-Free', 'health_labels_Mustard-Free',
       'health_labels_No oil added', 'health_labels_Paleo',
       'health_labels_Peanut-Free', 'health_labels_Pescatarian',
       'health_labels_Pork-Free', 'health_labels_Red-Meat-Free',
       'health_labels_Sesame-Free', 'health_labels_Shellfish-Free',
       'health_labels_Soy-Free', 'health_la

In [418]:
for col in cols_obj_numerical:
    df_recs_src_dt_hlt_ctns_ct_typ_ingr[col] = df_recs_src_dt_hlt_ctns_ct_typ_ingr[col].astype(int)

In [419]:
#Finding the number of columns with each datatype:
df_recs_src_dt_hlt_ctns_ct_typ_ingr.dtypes.value_counts()

#Source:
#https://stackoverflow.com/questions/32337380/count-data-types-in-pandas-dataframe

uint8      204
int64       97
float64     34
object       2
dtype: int64

In [420]:
df_recs_clean = df_recs_src_dt_hlt_ctns_ct_typ_ingr

In [421]:
type(df_recs_clean)

pandas.core.frame.DataFrame

In [422]:
df_recs_clean.dtypes.value_counts()

uint8      204
int64       97
float64     34
object       2
dtype: int64

In [423]:
#Saving df_recs before columns will be dropped:
df_recs_clean.to_csv('./data/df_recs_clean_final.csv', index = False)

In [424]:
df_recs_clean.columns

Index(['title', 'url', 'yield', 'total_weight', 'total_time',
       'calories_per_serv', 'total_weight_per_serv', 'total_time_per_serv',
       'monounsat_fat_g_per_serv', 'polyunsat_fat_g_per_serv',
       ...
       'ingredient_category_vegetables', 'ingredient_category_water',
       'ingredient_category_wines', 'ingredient_category_yogurt',
       'ingredient_category_condiments_and_sauces',
       'ingredient_category_cured_meats', 'ingredient_category_frozen_poultry',
       'ingredient_category_plant_based_protein',
       'ingredient_category_liquors_and_cocktails',
       'ingredient_category_sugar_and_syrups'],
      dtype='object', length=337)

In [427]:
#Increasing the number of columns that will be displayed when inspecting the head of a dataframe:
pd.set_option('display.max_columns', 350)
#Source: https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns

In [428]:
df_recs_clean.head()

Unnamed: 0,title,url,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free,cautions_FODMAP,cautions_Sulfites,cuisine_type_.,cuisine_type_4 Points,cuisine_type_African,cuisine_type_Albanian,cuisine_type_Amerian,cuisine_type_American,"cuisine_type_American, Barbecue, Southern","cuisine_type_American, French","cuisine_type_American, Italian","cuisine_type_American, Southern",cuisine_type_American-South,cuisine_type_American|Asian,cuisine_type_American|North American|Cajun/Creole,cuisine_type_Amish,cuisine_type_Andhra,cuisine_type_Appetizer,cuisine_type_Arab,cuisine_type_Argentinian,cuisine_type_Armenian,cuisine_type_Asian,"cuisine_type_Asian, Chinese",cuisine_type_Aussie,cuisine_type_Australian,cuisine_type_Australian/New Zealand,cuisine_type_Austrian,cuisine_type_Baking,cuisine_type_Balochi,cuisine_type_Bangladeshi,cuisine_type_Belarusian,cuisine_type_Belgian,cuisine_type_Belgium,cuisine_type_Brazilian,cuisine_type_Breakfast,cuisine_type_British,cuisine_type_BritishEurope,cuisine_type_Bulgarian,cuisine_type_Cajun,cuisine_type_Cajun/Creole|Southern,cuisine_type_Canadian,cuisine_type_Caribbean,cuisine_type_Central American/Caribbean,cuisine_type_Cheap,cuisine_type_Chechen,cuisine_type_Chinese,cuisine_type_Colombian,cuisine_type_Cookie,cuisine_type_Costa Rican,cuisine_type_Creole,cuisine_type_Cuban,cuisine_type_Cuban|Central American/Caribbean,cuisine_type_Curry,cuisine_type_Danish,cuisine_type_Dessert,cuisine_type_Dominican,cuisine_type_Dutch,cuisine_type_Earth,cuisine_type_Eastern Europe,cuisine_type_Eastern European,cuisine_type_Ecuadorian,cuisine_type_Egypt,cuisine_type_English,cuisine_type_Estern,cuisine_type_Europe,cuisine_type_European,"cuisine_type_European, French, Paris","cuisine_type_European, Italian","cuisine_type_European, Scottish",cuisine_type_Filipino,cuisine_type_France,cuisine_type_French,cuisine_type_GAPS,cuisine_type_German,cuisine_type_Goan,cuisine_type_Gourment,cuisine_type_Greek,cuisine_type_Hawaiian,cuisine_type_Honduran,cuisine_type_Hungarian,cuisine_type_Iatlian,cuisine_type_Icelandic,cuisine_type_Icings,cuisine_type_Illinois,cuisine_type_Indian,cuisine_type_Indonesian,cuisine_type_Irish,cuisine_type_Italian,cuisine_type_Italian American,cuisine_type_Jamaican,cuisine_type_Jamaicann,cuisine_type_Japanese,cuisine_type_Jewish,cuisine_type_Korean,cuisine_type_Latin,cuisine_type_Latin America,cuisine_type_Latin American,cuisine_type_Low Carb,cuisine_type_Magic,cuisine_type_Malay,cuisine_type_Malaysian,cuisine_type_Meat,cuisine_type_Mediteranian,cuisine_type_Mediterranean,cuisine_type_Mediterrenian,cuisine_type_Meditteranean,cuisine_type_Mexican,cuisine_type_Mexican Seafood,cuisine_type_Mexican/American,cuisine_type_Mexican|Latin American,cuisine_type_Modern,cuisine_type_Mordovian,cuisine_type_Morocan,cuisine_type_Moroccan,cuisine_type_Muffin,cuisine_type_Netherlands,cuisine_type_New England,cuisine_type_Nordic,cuisine_type_North African,cuisine_type_Norwegian,cuisine_type_Oriental,cuisine_type_Pakistani,cuisine_type_Paleo,cuisine_type_Pastry,cuisine_type_Pennsylvania Dutch,cuisine_type_Persian,cuisine_type_Peruvian,cuisine_type_Philippines,cuisine_type_Pizza,cuisine_type_Points Plus,cuisine_type_Polish,cuisine_type_Portuguese,cuisine_type_Puerto Rican,cuisine_type_Raisine,cuisine_type_Romanian,cuisine_type_Russian,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan,dish_type_breakfast,dish_type_dessert,dish_type_dinner,dish_type_lunch,dish_type_nibble,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Mexican,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


Each serving:
Want higher percent DV in nutrients you want to get more of and lower percent DV in nutrients you want to get less of. 
As a general rule:
5% DV (0.05 proportion of DV) or less of a nutrient per serving is low
20% DV (0.20 proportion of DV) or more of a nutrient per serving is high

Source: https://www.accessdata.fda.gov/scripts/InteractiveNutritionFactsLabel/pdv.html

In [429]:
df_recs_clean.select_dtypes(include = float).columns

Index(['yield', 'total_weight', 'total_time', 'calories_per_serv',
       'total_weight_per_serv', 'total_time_per_serv',
       'monounsat_fat_g_per_serv', 'polyunsat_fat_g_per_serv',
       'trans_fat_g_per_serv', 'sugar_g_per_serv',
       'calcium_pct_div100_per_serv', 'carbs_pct_div100_per_serv',
       'cholesterol_pct_div100_per_serv', 'energy_pct_div100_per_serv',
       'sat_fat_pct_div100_per_serv', 'fat_pct_div100_per_serv',
       'iron_pct_div100_per_serv', 'fiber_pct_div100_per_serv',
       'folate_pct_div100_per_serv', 'potassium_pct_div100_per_serv',
       'magnesium_pct_div100_per_serv', 'sodium_pct_div100_per_serv',
       'niacin_pct_div100_per_serv', 'phosphorus_pct_div100_per_serv',
       'protein_pct_div100_per_serv', 'riboflavin_pct_div100_per_serv',
       'thiamin_pct_div100_per_serv', 'vit_A_pct_div100_per_serv',
       'vit_B6_pct_div100_per_serv', 'vit_B12_pct_div100_per_serv',
       'vit_C_pct_div100_per_serv', 'vit_D_pct_div100_per_serv',
       'vit_E

In [445]:
nutrient_score_list = []

for i in range(len(df_recs_clean['url'])):
    #Initializing counter:
    count = 0
    
    #Nutrients want more of:
    if df_recs_clean['calcium_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1
    if df_recs_clean['iron_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1
    if df_recs_clean['fiber_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['folate_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1    
    if df_recs_clean['potassium_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['magnesium_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1   
    if df_recs_clean['niacin_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['phosphorus_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1
    if df_recs_clean['riboflavin_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['thiamin_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1  
    if df_recs_clean['vit_A_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_B6_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_B12_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_C_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_D_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_E_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1 
    if df_recs_clean['vit_K_pct_div100_per_serv'].iloc[i] > 0.20:
        count += 1                                                        
                                                           
    #Nutrients need but shouldn't overdo:                                                       
    if df_recs_clean['carbs_pct_div100_per_serv'].iloc[i] < 0.20:
        count += 1
    if df_recs_clean['cholesterol_pct_div100_per_serv'].iloc[i] < 0.20:
        count += 1
    if df_recs_clean['sat_fat_pct_div100_per_serv'].iloc[i] < 0.20:
        count += 1
    if df_recs_clean['fat_pct_div100_per_serv'].iloc[i] < 0.20:
        count += 1  
    if df_recs_clean['sodium_pct_div100_per_serv'].iloc[i] < 0.20:
        count += 1                                                        
    
    #Nutrients need but shouldn't overdo (more calculations than < 0.20 here):                                                       
    if df_recs_clean['energy_pct_div100_per_serv'].iloc[i] < 0.33:
        count += 1 
    if df_recs_clean['protein_pct_div100_per_serv'].iloc[i] > 0.20 and df_recs_clean['protein_pct_div100_per_serv'].iloc[i] < 0.33:
        count += 1 
    if df_recs_clean['monounsat_fat_g_per_serv'].iloc[i] <  (0.3*2000/9)/3 : #avoiding overdoing unsat. fat
        count += 1
    if df_recs_clean['polyunsat_fat_g_per_serv'].iloc[i] <  (0.3*2000/9)/3 : #avoiding overdoing unsat. fat
        count += 1
    
    #Nutrients to try to avoid:                                                       
    if df_recs_clean['trans_fat_g_per_serv'].iloc[i] == 0: #avoiding trans fat
        count += 1
    if df_recs_clean['sugar_g_per_serv'].iloc[i] == 0: #avoiding added sugar
        count += 1  
                                                           
    nutrient_score_list.append(count)                                                       
    

In [448]:
len(nutrient_score_list)

22941

In [446]:
df_recs_clean['nutrition_score'] = nutrient_score_list

In [447]:
df_recs_clean.head()

Unnamed: 0,title,url,yield,total_weight,total_time,calories_per_serv,total_weight_per_serv,total_time_per_serv,monounsat_fat_g_per_serv,polyunsat_fat_g_per_serv,trans_fat_g_per_serv,sugar_g_per_serv,calcium_pct_div100_per_serv,carbs_pct_div100_per_serv,cholesterol_pct_div100_per_serv,energy_pct_div100_per_serv,sat_fat_pct_div100_per_serv,fat_pct_div100_per_serv,iron_pct_div100_per_serv,fiber_pct_div100_per_serv,folate_pct_div100_per_serv,potassium_pct_div100_per_serv,magnesium_pct_div100_per_serv,sodium_pct_div100_per_serv,niacin_pct_div100_per_serv,phosphorus_pct_div100_per_serv,protein_pct_div100_per_serv,riboflavin_pct_div100_per_serv,thiamin_pct_div100_per_serv,vit_A_pct_div100_per_serv,vit_B6_pct_div100_per_serv,vit_B12_pct_div100_per_serv,vit_C_pct_div100_per_serv,vit_D_pct_div100_per_serv,vit_E_pct_div100_per_serv,vit_K_pct_div100_per_serv,source_BigOven,source_Cookstr,source_Delish,source_Epicurious,source_Food Network,source_Food52,source_Foodista,source_Good Housekeeping,source_Group Recipes,source_Kitchen Daily,source_Kraft Foods,source_Martha Stewart,source_My Recipes,source_Saveur,source_Serious Eats,source_Taste of Home,source_Williams-Sonoma,source_food.com,source_recipezaar.com,source_nan,diet_labels_Balanced,diet_labels_High-Fiber,diet_labels_High-Protein,diet_labels_Low-Carb,diet_labels_Low-Fat,diet_labels_Low-Sodium,health_labels_Alcohol-Cocktail,health_labels_Alcohol-Free,health_labels_Celery-Free,health_labels_Crustacean-Free,health_labels_Dairy-Free,health_labels_Egg-Free,health_labels_Fish-Free,health_labels_Gluten-Free,health_labels_Keto-Friendly,health_labels_Kidney-Friendly,health_labels_Kosher,health_labels_Low Potassium,health_labels_Low Sugar,health_labels_Lupine-Free,health_labels_Mollusk-Free,health_labels_Mustard-Free,health_labels_No oil added,health_labels_Paleo,health_labels_Peanut-Free,health_labels_Pescatarian,health_labels_Pork-Free,health_labels_Red-Meat-Free,health_labels_Sesame-Free,health_labels_Shellfish-Free,health_labels_Soy-Free,health_labels_Sugar-Conscious,health_labels_Tree-Nut-Free,health_labels_Vegan,health_labels_Vegetarian,health_labels_Wheat-Free,cautions_FODMAP,cautions_Sulfites,cuisine_type_.,cuisine_type_4 Points,cuisine_type_African,cuisine_type_Albanian,cuisine_type_Amerian,cuisine_type_American,"cuisine_type_American, Barbecue, Southern","cuisine_type_American, French","cuisine_type_American, Italian","cuisine_type_American, Southern",cuisine_type_American-South,cuisine_type_American|Asian,cuisine_type_American|North American|Cajun/Creole,cuisine_type_Amish,cuisine_type_Andhra,cuisine_type_Appetizer,cuisine_type_Arab,cuisine_type_Argentinian,cuisine_type_Armenian,cuisine_type_Asian,"cuisine_type_Asian, Chinese",cuisine_type_Aussie,cuisine_type_Australian,cuisine_type_Australian/New Zealand,cuisine_type_Austrian,cuisine_type_Baking,cuisine_type_Balochi,cuisine_type_Bangladeshi,cuisine_type_Belarusian,cuisine_type_Belgian,cuisine_type_Belgium,cuisine_type_Brazilian,cuisine_type_Breakfast,cuisine_type_British,cuisine_type_BritishEurope,cuisine_type_Bulgarian,cuisine_type_Cajun,cuisine_type_Cajun/Creole|Southern,cuisine_type_Canadian,cuisine_type_Caribbean,cuisine_type_Central American/Caribbean,cuisine_type_Cheap,cuisine_type_Chechen,cuisine_type_Chinese,cuisine_type_Colombian,cuisine_type_Cookie,cuisine_type_Costa Rican,cuisine_type_Creole,cuisine_type_Cuban,cuisine_type_Cuban|Central American/Caribbean,cuisine_type_Curry,cuisine_type_Danish,cuisine_type_Dessert,cuisine_type_Dominican,cuisine_type_Dutch,cuisine_type_Earth,cuisine_type_Eastern Europe,cuisine_type_Eastern European,cuisine_type_Ecuadorian,cuisine_type_Egypt,cuisine_type_English,cuisine_type_Estern,cuisine_type_Europe,cuisine_type_European,"cuisine_type_European, French, Paris","cuisine_type_European, Italian","cuisine_type_European, Scottish",cuisine_type_Filipino,cuisine_type_France,cuisine_type_French,cuisine_type_GAPS,cuisine_type_German,cuisine_type_Goan,cuisine_type_Gourment,cuisine_type_Greek,cuisine_type_Hawaiian,cuisine_type_Honduran,cuisine_type_Hungarian,cuisine_type_Iatlian,cuisine_type_Icelandic,cuisine_type_Icings,cuisine_type_Illinois,cuisine_type_Indian,cuisine_type_Indonesian,cuisine_type_Irish,cuisine_type_Italian,cuisine_type_Italian American,cuisine_type_Jamaican,cuisine_type_Jamaicann,cuisine_type_Japanese,cuisine_type_Jewish,cuisine_type_Korean,cuisine_type_Latin,cuisine_type_Latin America,cuisine_type_Latin American,cuisine_type_Low Carb,cuisine_type_Magic,cuisine_type_Malay,cuisine_type_Malaysian,cuisine_type_Meat,cuisine_type_Mediteranian,cuisine_type_Mediterranean,cuisine_type_Mediterrenian,cuisine_type_Meditteranean,cuisine_type_Mexican,cuisine_type_Mexican Seafood,cuisine_type_Mexican/American,cuisine_type_Mexican|Latin American,cuisine_type_Modern,cuisine_type_Mordovian,cuisine_type_Morocan,cuisine_type_Moroccan,cuisine_type_Muffin,cuisine_type_Netherlands,cuisine_type_New England,cuisine_type_Nordic,cuisine_type_North African,cuisine_type_Norwegian,cuisine_type_Oriental,cuisine_type_Pakistani,cuisine_type_Paleo,cuisine_type_Pastry,cuisine_type_Pennsylvania Dutch,cuisine_type_Persian,cuisine_type_Peruvian,cuisine_type_Philippines,cuisine_type_Pizza,cuisine_type_Points Plus,cuisine_type_Polish,cuisine_type_Portuguese,cuisine_type_Puerto Rican,cuisine_type_Raisine,cuisine_type_Romanian,cuisine_type_Russian,cuisine_type_Sandwiches,cuisine_type_Scandanavian,cuisine_type_Scandinavian,cuisine_type_Scottish,cuisine_type_Seafood,cuisine_type_Sichuan Chinese,cuisine_type_Singaporean,cuisine_type_Slovak,cuisine_type_Slow cooker,cuisine_type_Soup,cuisine_type_South American,cuisine_type_Southern,cuisine_type_Southern American,cuisine_type_Spanish,cuisine_type_Sri Lankan,cuisine_type_Swedish,cuisine_type_Swiss,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Thailand,cuisine_type_Traditional English,cuisine_type_Turkish,cuisine_type_Uncategorized,cuisine_type_Vancouver,cuisine_type_Vegan,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Zanzibari,cuisine_type_american,cuisine_type_british,cuisine_type_californian,cuisine_type_chinese,cuisine_type_comfort food,cuisine_type_dessert,cuisine_type_english,cuisine_type_french,cuisine_type_hawaiian,cuisine_type_italian,cuisine_type_low carb,cuisine_type_mediterranean,cuisine_type_mexican,cuisine_type_pakistani,cuisine_type_soup,cuisine_type_western,cuisine_type_nan,dish_type_breakfast,dish_type_dessert,dish_type_dinner,dish_type_lunch,dish_type_nibble,ingredient_category_100% fruit juice,ingredient_category_100% juice,ingredient_category_Baby food,ingredient_category_Mexican,ingredient_category_Vegan products,ingredient_category_beer,"ingredient_category_bread, rolls and tortillas",ingredient_category_candy,ingredient_category_canned fruit,ingredient_category_canned grains,ingredient_category_canned meats,ingredient_category_canned poultry,ingredient_category_canned seafood,ingredient_category_canned soup,ingredient_category_canned vegetables,ingredient_category_cheese,ingredient_category_chocolate,ingredient_category_coffee and tea,ingredient_category_cooked grains,ingredient_category_crackers,ingredient_category_dairy,ingredient_category_dried fruit and nuts,ingredient_category_eggs,ingredient_category_flavored water,ingredient_category_frozen grained based,ingredient_category_frozen grains,ingredient_category_frozen treats,ingredient_category_fruit,ingredient_category_grains,ingredient_category_meats,ingredient_category_milk,ingredient_category_mixed grains,ingredient_category_mixed seafood,ingredient_category_mixed soup,ingredient_category_non-dairy beverages,ingredient_category_oils,ingredient_category_pastries,ingredient_category_pizza,ingredient_category_poultry,ingredient_category_protein and nutritional powders,ingredient_category_quick breads and pastries,ingredient_category_ready-to-eat cereals,ingredient_category_salads,ingredient_category_sandwhiches,ingredient_category_savory snacks,ingredient_category_seafood,ingredient_category_sugar jam,ingredient_category_sugars,ingredient_category_sweetened beverages,ingredient_category_vegetables,ingredient_category_water,ingredient_category_wines,ingredient_category_yogurt,ingredient_category_condiments_and_sauces,ingredient_category_cured_meats,ingredient_category_frozen_poultry,ingredient_category_plant_based_protein,ingredient_category_liquors_and_cocktails,ingredient_category_sugar_and_syrups,nutrition_score
0,Martini Recipe,http://www.seriouseats.com/recipes/2010/06/the...,2.0,85.627736,0.0,87.178569,42.813868,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.043589,0.0,0.0,0.00292,0.0,0.000354,0.002987,0.003712,0.000533,0.001473,0.004066,0.000198,0.002517,0.000605,0.0,0.00589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9
1,Pasta Dough,http://www.marthastewart.com/337857/pasta-dough,6.0,603.5,60.0,354.023333,100.583333,10.0,8.329406,2.791117,0.0,0.429833,0.079372,0.112767,2.049444,0.177012,0.289375,0.272272,0.113728,0.045,0.233917,0.022632,0.028571,0.011699,0.033402,0.38,0.26583,0.242974,0.124778,0.239889,0.166667,0.460417,0.0,0.204,0.120658,0.015635,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13
2,Classic Negroni Cocktail Recipe,http://www.saveur.com/article/Recipes/Negroni-...,2.0,85.048569,0.0,81.788374,42.524285,0.0,0.0,0.0,0.0,0.111981,0.001134,0.001285,0.0,0.040894,0.0,0.0,0.003229,0.0,0.000354,0.003046,0.003712,0.000472,0.001586,0.00486,0.000198,0.002944,0.001299,0.0,0.005997,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9
3,Simple Fresh Pasta,http://food52.com/recipes/27825-simple-fresh-p...,6.0,452.134955,0.0,217.75,75.355826,0.0,0.958,0.68425,0.0095,0.2275,0.021585,0.127783,0.31,0.108875,0.04295,0.044115,0.056871,0.054,0.061875,0.018729,0.033342,0.072671,0.040234,0.147857,0.1661,0.103269,0.058333,0.044444,0.049615,0.092708,0.0,0.033333,0.0195,0.001875,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7
4,Egg Noodle,http://www.epicurious.com/recipes/food/views/E...,6.0,559.808863,0.0,301.271778,93.301477,0.0,1.648888,0.943102,0.005447,0.215809,0.030707,0.174975,0.50243,0.150636,0.070593,0.068406,0.154068,0.08842,0.402836,0.0268,0.058148,0.089929,0.212137,0.208924,0.221152,0.262312,0.355459,0.063487,0.07907,0.126097,0.0,0.051431,0.031864,0.001924,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,13


In [449]:
#Saving df_recs before columns will be dropped:
df_recs_clean.to_csv('./data/df_recs_clean_final_nutr_sc.csv', index = False)