# Cut original data files

recipes.csv and reviews.csv need to be cut down to under 100MB each so that they can be uploaded to GitHub

Unnecessary fields will be cut

Records dated before 2013 will be cut so that only 10 years of data remain. Restricting to this period also means the reviews represent a larger portion of the population, because computers, smartphones, and the internet had become more ubiquitous by then

In [20]:
# import dependencies
import pandas as pd
import datetime as dt
import numpy as np

# Reviews Data

In [2]:
# Load the original reviews export and preview its first rows
reviews_raw = pd.read_csv("../og_files/reviews.csv")
reviews_raw.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25T21:44:00Z,2000-01-25T21:44:00Z
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17T16:49:59Z,2001-10-17T16:49:59Z
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25T09:00:00Z,2000-02-25T09:00:00Z
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13T21:15:00Z,2000-03-13T21:15:00Z
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28T12:51:00Z,2000-03-28T12:51:00Z


In [4]:
# Inspect the dtype of every column. DateSubmitted must be a datetime dtype
# before it can be compared against a cutoff date; as loaded it is `object`
# (plain strings), so it is cast in the next step.
reviews_raw.dtypes

ReviewId          int64
RecipeId          int64
AuthorId          int64
AuthorName       object
Rating            int64
Review           object
DateSubmitted    object
DateModified     object
dtype: object

In [23]:
# DateSubmitted is loaded as ISO-8601 strings ending in "Z" (UTC) and must be
# converted to real datetimes before filtering by date.
# A unit-less "datetime64" dtype passed to astype() is rejected by pandas 2.x,
# so parse explicitly with pd.to_datetime instead. The values are parsed as UTC
# and the timezone is then dropped so the column stays timezone-naive
# (datetime64[ns]) and remains comparable with a naive np.datetime64 cutoff.
reviews_raw = reviews_raw.assign(
    DateSubmitted=pd.to_datetime(reviews_raw["DateSubmitted"], utc=True).dt.tz_localize(None)
)

In [28]:
# Keep only reviews submitted in 2013 or later so that 10 years of data remain.
# The comparison is inclusive (>=) so a review stamped exactly
# 2013-01-01T00:00:00 is kept, matching the "2013 or later" intent.
reviews_cut = reviews_raw.loc[reviews_raw["DateSubmitted"] >= np.datetime64('2013-01-01'), :]

In [29]:
# Write the date-filtered reviews to CSV, leaving out the DataFrame index
reviews_cut.to_csv(
    "Resources/reviews_cut_2013.csv",
    index=False,
    encoding='utf8',
)

# Recipes Data

In [30]:
# Load the original recipes export and preview its first rows
recipes_raw = pd.read_csv("../og_files/recipes.csv")
recipes_raw.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


In [37]:
# List every column of the raw recipes frame (the preview above elides the
# middle columns) before deciding which ones to drop.
recipes_raw.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')

In [38]:
# Drop the columns that are not needed downstream: "Images", "AuthorName",
# "RecipeYield", "RecipeInstructions", "Keywords", 'RecipeIngredientQuantities',
# 'RecipeIngredientParts' and 'Description'. The result is a new frame;
# recipes_raw itself is left untouched.
recipes_cut = recipes_raw.drop(
    columns=[
        "Images",
        "AuthorName",
        "RecipeYield",
        "RecipeInstructions",
        "Keywords",
        'RecipeIngredientQuantities',
        'RecipeIngredientParts',
        'Description',
    ]
)
recipes_cut.head()

Unnamed: 0,RecipeId,Name,AuthorId,CookTime,PrepTime,TotalTime,DatePublished,RecipeCategory,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings
0,38,Low-Fat Berry Blue Frozen Dessert,1533,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Frozen Desserts,4.5,4.0,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0
1,39,Biryani,1567,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Chicken Breast,3.0,1.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0
2,40,Best Lemonade,1566,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,Beverages,4.5,10.0,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0
3,41,Carina's Tofu-Vegetable Kebabs,1586,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,Soy/Tofu,4.5,2.0,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0
4,42,Cabbage Soup,1538,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Vegetable,4.5,11.0,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0


In [40]:
# Inspect column dtypes; DatePublished needs to be datetime64 for the date filter.
# NOTE(review): the cast to datetime happens in the following cell, so on a
# top-to-bottom run this check shows the pre-cast dtype. The saved output
# already shows datetime64[ns], apparently because this cell (In [40]) was
# executed after the cast cell (In [39]).
recipes_cut.dtypes

RecipeId                        int64
Name                           object
AuthorId                        int64
CookTime                       object
PrepTime                       object
TotalTime                      object
DatePublished          datetime64[ns]
RecipeCategory                 object
AggregatedRating              float64
ReviewCount                   float64
Calories                      float64
FatContent                    float64
SaturatedFatContent           float64
CholesterolContent            float64
SodiumContent                 float64
CarbohydrateContent           float64
FiberContent                  float64
SugarContent                  float64
ProteinContent                float64
RecipeServings                float64
dtype: object

In [39]:
# Cast DatePublished from ISO-8601 strings (ending in "Z", i.e. UTC) to datetimes,
# then re-check the dtypes.
# A unit-less "datetime64" dtype passed to astype() is rejected by pandas 2.x,
# so parse explicitly with pd.to_datetime instead. The values are parsed as UTC
# and the timezone is then dropped so the column stays timezone-naive
# (datetime64[ns]) and remains comparable with a naive np.datetime64 cutoff.
recipes_cut = recipes_cut.assign(
    DatePublished=pd.to_datetime(recipes_cut["DatePublished"], utc=True).dt.tz_localize(None)
)
recipes_cut.dtypes

RecipeId                        int64
Name                           object
AuthorId                        int64
CookTime                       object
PrepTime                       object
TotalTime                      object
DatePublished          datetime64[ns]
RecipeCategory                 object
AggregatedRating              float64
ReviewCount                   float64
Calories                      float64
FatContent                    float64
SaturatedFatContent           float64
CholesterolContent            float64
SodiumContent                 float64
CarbohydrateContent           float64
FiberContent                  float64
SugarContent                  float64
ProteinContent                float64
RecipeServings                float64
dtype: object

In [41]:
# Keep only recipes published in 2013 or later so that 10 years of data remain.
# The comparison is inclusive (>=) so a recipe stamped exactly
# 2013-01-01T00:00:00 is kept, matching the "2013 or later" intent.
recipes_cut_2 = recipes_cut.loc[recipes_cut["DatePublished"] >= np.datetime64('2013-01-01'), :]

In [42]:
# Write the date-filtered recipes to CSV, leaving out the DataFrame index
recipes_cut_2.to_csv(
    "Resources/recipes_cut_2013.csv",
    index=False,
    encoding='utf8',
)

# Recipes Keywords and Ingredients

In [50]:
# Build the keywords/ingredients table for the recipes that survived the date cut.
# Rows are selected by the index labels of recipes_cut_2 rather than by repeating
# the date comparison, so this table always contains exactly the same recipes
# as the exported recipes file. Columns come from recipes_raw because Keywords
# and RecipeIngredientParts were dropped from recipes_cut; DatePublished is
# therefore taken from recipes_raw as loaded rather than from the cast column
# in recipes_cut.
recipes_ingrediants = recipes_raw.loc[
    recipes_cut_2.index,
    [
        'RecipeId',
        'Name',
        'TotalTime',
        'DatePublished',
        'RecipeCategory',
        'Keywords',
        'RecipeIngredientParts',
        'AggregatedRating',
        'ReviewCount',
    ],
]

In [51]:
# Display the keywords/ingredients frame to eyeball the selection
# (pandas abbreviates the middle rows of a long frame when rendering).
recipes_ingrediants

Unnamed: 0,RecipeId,Name,TotalTime,DatePublished,RecipeCategory,Keywords,RecipeIngredientParts,AggregatedRating,ReviewCount
127958,134437,Copycat Wendy&rsquo;s Spicy Chicken Fillet San...,PT50M,2019-02-22T18:59:00Z,Lunch/Snacks,"c(""Chicken"", ""Poultry"", ""Meat"", ""< 60 Mins"")","c(""chicken breasts"", ""buttermilk"", ""all-purpos...",4.0,3.0
212104,221240,Copycat Red Lobster Nantucket Baked Cod,PT16M,2019-02-22T18:53:00Z,Very Low Carbs,"""< 30 Mins""","c(""olive oil"", ""cod fish fillets"", ""lemon juic...",,1.0
240475,250325,Copycat Jimmy Dean Breakfast Sausage,PT3H8M,2019-02-22T19:02:00Z,Breakfast,"c(""Pork"", ""Meat"", ""< 4 Hours"")","c(""ground pork"", ""brown sugar"", ""light corn sy...",1.5,5.0
433189,449227,Copycat Panera Bread Spinach &amp; Artichoke S...,PT50M,2019-02-22T18:57:00Z,Breakfast,"""< 60 Mins""","c(""unsalted butter"", ""all-purpose flour"", ""sal...",5.0,1.0
475190,492718,Tamale Hash,PT13M,2013-01-01T06:30:00Z,Breakfast,"c(""Mexican"", ""< 15 Mins"", ""Easy"")","c(""onion"", ""garlic clove"", ""salsa"", ""Black-Eye...",5.0,2.0
...,...,...,...,...,...,...,...,...,...
522512,541379,Meg's Fresh Ginger Gingerbread,PT1H35M,2020-12-22T15:27:00Z,Dessert,"""< 4 Hours""","c(""fresh ginger"", ""unsalted butter"", ""dark bro...",,
522513,541380,Roast Prime Rib au Poivre with Mixed Peppercorns,PT3H30M,2020-12-22T15:32:00Z,Very Low Carbs,"c(""High Protein"", ""High In..."", ""< 4 Hours"")","c(""Dijon mustard"", ""garlic"", ""peppercorns"", ""s...",,
522514,541381,Kirshwasser Ice Cream,PT4H,2020-12-22T15:33:00Z,Ice Cream,"c(""Dessert"", ""< 4 Hours"")","c(""half-and-half"", ""heavy cream"", ""brandy"", ""s...",,
522515,541382,Quick & Easy Asian Cucumber Salmon Rolls,PT15M,2020-12-22T22:11:00Z,Canadian,"c(""< 15 Mins"", ""Easy"")","c(""wasabi paste"", ""dill"", ""English cucumber"", ...",,


In [49]:
# Write the keywords/ingredients table to CSV, leaving out the DataFrame index
recipes_ingrediants.to_csv(
    "Resources/recipes_ingredients_2013.csv",
    index=False,
    encoding='utf8',
)