data sources:

- https://fdc.nal.usda.gov/download-datasets.html
- https://www.ers.usda.gov/data-products/fruit-and-vegetable-prices.aspx

- https://www.kaggle.com/datasets/trolukovich/nutritional-values-for-common-foods-and-products

explored, but not used
- https://www.kaggle.com/datasets/thedevastator/the-nutritional-content-of-food-a-comprehensive
- https://www.kaggle.com/datasets/thunderz/food-data


inspiration sources:

- https://nutritionj.biomedcentral.com/articles/10.1186/s12937-019-0496-5
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2954450/
- https://www.healthline.com/nutrition/29-cheap-healthy-foods
- https://nutritionfacts.org/blog/best-nutrition-bang-for-your-buck/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# data source:
# https://www.ers.usda.gov/data-products/fruit-and-vegetable-prices.aspx

fruit = pd.read_csv('data/prices/Fruit Prices 2020.csv')

In [3]:
fruit.head()

Unnamed: 0,Fruit,Form,RetailPrice,RetailPriceUnit,Yield,CupEquivalentSize,CupEquivalentUnit,CupEquivalentPrice
0,Apples,Fresh,1.5193,per pound,0.9,0.2425,pounds,0.4094
1,"Apples, applesauce",Canned,1.066,per pound,1.0,0.5401,pounds,0.5758
2,"Apples, ready-to-drink",Juice,0.7804,per pint,1.0,8.0,fluid ounces,0.3902
3,"Apples, frozen concentrate",Juice,0.5853,per pint,1.0,8.0,fluid ounces,0.2926
4,Apricots,Fresh,2.9665,per pound,0.93,0.3638,pounds,1.1603


In [4]:
fruit.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Fruit               62 non-null     object 
 1   Form                62 non-null     object 
 2   RetailPrice         62 non-null     float64
 3   RetailPriceUnit     62 non-null     object 
 4   Yield               62 non-null     float64
 5   CupEquivalentSize   62 non-null     float64
 6   CupEquivalentUnit   62 non-null     object 
 7   CupEquivalentPrice  62 non-null     float64
dtypes: float64(4), object(4)
memory usage: 18.1 KB


In [5]:
veg = pd.read_csv('data/prices/Vegetable Prices 2020.csv')

In [6]:
veg.head()

Unnamed: 0,Vegetable,Form,RetailPrice,RetailPriceUnit,Yield,CupEquivalentSize,CupEquivalentUnit,CupEquivalentPrice
0,Acorn squash,Fresh,1.1804,per pound,0.4586,0.4519,pounds,1.1633
1,Artichoke,Fresh,2.1913,per pound,0.375,0.3858,pounds,2.2545
2,Artichoke,Canned,3.4119,per pound,0.65,0.3858,pounds,2.0251
3,Asparagus,Fresh,2.7576,per pound,0.4938,0.3968,pounds,2.2159
4,Asparagus,Canned,3.1269,per pound,0.65,0.3968,pounds,1.909


In [7]:
veg.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Vegetable           93 non-null     object 
 1   Form                93 non-null     object 
 2   RetailPrice         93 non-null     float64
 3   RetailPriceUnit     93 non-null     object 
 4   Yield               93 non-null     float64
 5   CupEquivalentSize   93 non-null     float64
 6   CupEquivalentUnit   93 non-null     object 
 7   CupEquivalentPrice  93 non-null     float64
dtypes: float64(4), object(4)
memory usage: 26.8 KB


In [8]:
fruit.rename(columns={ fruit.columns[0]: "Item" }, inplace = True)
veg.rename(columns={ veg.columns[0]: "Item" }, inplace = True)
prices = pd.concat([fruit, veg])
prices

Unnamed: 0,Item,Form,RetailPrice,RetailPriceUnit,Yield,CupEquivalentSize,CupEquivalentUnit,CupEquivalentPrice
0,Apples,Fresh,1.5193,per pound,0.9000,0.2425,pounds,0.4094
1,"Apples, applesauce",Canned,1.0660,per pound,1.0000,0.5401,pounds,0.5758
2,"Apples, ready-to-drink",Juice,0.7804,per pint,1.0000,8.0000,fluid ounces,0.3902
3,"Apples, frozen concentrate",Juice,0.5853,per pint,1.0000,8.0000,fluid ounces,0.2926
4,Apricots,Fresh,2.9665,per pound,0.9300,0.3638,pounds,1.1603
...,...,...,...,...,...,...,...,...
88,Tomatoes,Canned,1.0175,per pound,1.0000,0.5401,pounds,0.5496
89,Turnip greens,Fresh,2.4176,per pound,0.7500,0.3197,pounds,1.0304
90,Turnip greens,Canned,1.0429,per pound,0.6500,0.3527,pounds,0.5660
91,Turnip greens,Frozen,1.9451,per pound,0.7760,0.3527,pounds,0.8841


In [9]:
prices['RetailPriceUnit'].value_counts()

per pound    144
per pint      11
Name: RetailPriceUnit, dtype: int64

In [10]:
prices['CupEquivalentUnit'].value_counts()

pounds          144
fluid ounces     11
Name: CupEquivalentUnit, dtype: int64

In [11]:
prices['Form'].value_counts()

Fresh     66
Canned    36
Frozen    25
Dried     17
Juice     11
Name: Form, dtype: int64

In [12]:
# only juice prices are per pint; everything else is per pound--easily convertable to grams
prices.groupby(['RetailPriceUnit','Form'])['Item'].count()

RetailPriceUnit  Form  
per pint         Juice     11
per pound        Canned    36
                 Dried     17
                 Fresh     66
                 Frozen    25
Name: Item, dtype: int64

In [13]:
# dropping unneeded fields and rows
prices = prices[['Item','Form', 'RetailPrice', 'RetailPriceUnit']]
prices = prices[~(prices['Form'] == 'Juice')]

In [14]:
gramsPerPound = 453.5924

In [15]:
prices['PricePer100Grams'] = prices['RetailPrice']/gramsPerPound*100
prices

Unnamed: 0,Item,Form,RetailPrice,RetailPriceUnit,PricePer100Grams
0,Apples,Fresh,1.5193,per pound,0.334948
1,"Apples, applesauce",Canned,1.0660,per pound,0.235013
4,Apricots,Fresh,2.9665,per pound,0.654001
5,"Apricots, packed in juice",Canned,1.6905,per pound,0.372691
6,"Apricots, packed in syrup or water",Canned,2.0600,per pound,0.454152
...,...,...,...,...,...
88,Tomatoes,Canned,1.0175,per pound,0.224320
89,Turnip greens,Fresh,2.4176,per pound,0.532990
90,Turnip greens,Canned,1.0429,per pound,0.229920
91,Turnip greens,Frozen,1.9451,per pound,0.428821


In [17]:
prices.to_csv('prices.csv', index=False)