In [1]:
import numpy as np
import pandas as pd

import sys, os

# Short aliases for the path helpers; `sep` is reused by later cells
abspath, dirname, sep = os.path.abspath, os.path.dirname, os.sep

# Make the in-house `src` package importable: take the working directory,
# climb two levels (its parent, then one more) and append that folder to sys.path
folder_ = dirname(dirname(abspath(os.getcwd())))
sys.path.append(folder_)

# In-house libraries
import src.utils.mining_data_tb as md
import src.utils.folder_tb as fo

In [2]:
# Resolve the environment data folder through the in-house folder helper
production_data_path = fo.path_to_folder(2, os.path.join("data", "environment"))

# Read the food production table and preview its first two rows
production_df = pd.read_csv(production_data_path + "food_production.csv")
production_df.head(2)

Unnamed: 0,Food product,Land use change,Animal Feed,Farm,Processing,Transport,Packging,Retail,Total_emissions,Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal),...,Freshwater withdrawals per 100g protein (liters per 100g protein),Freshwater withdrawals per kilogram (liters per kilogram),Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal),Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein),Land use per 1000kcal (m² per 1000kcal),Land use per kilogram (m² per kilogram),Land use per 100g protein (m² per 100g protein),Scarcity-weighted water use per kilogram (liters per kilogram),Scarcity-weighted water use per 100g protein (liters per 100g protein),Scarcity-weighted water use per 1000kcal (liters per 1000 kilocalories)
0,Wheat & Rye (Bread),0.1,0.0,0.8,0.2,0.1,0.1,0.1,1.4,,...,,,,,,,,,,
1,Maize (Meal),0.3,0.0,0.5,0.1,0.1,0.1,0.0,1.1,,...,,,,,,,,,,


In [3]:
# Summarise the columns: dtype and non-null count per column, to spot where values are missing
production_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 23 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Food product                                                             43 non-null     object 
 1   Land use change                                                          43 non-null     float64
 2   Animal Feed                                                              43 non-null     float64
 3   Farm                                                                     43 non-null     float64
 4   Processing                                                               43 non-null     float64
 5   Transport                                                                43 non-null     float64
 6   Packging                                                                 43 

In [4]:
# As we have some missing values in the data, we will try to get this information from other files

In [5]:
# Land use is measured in square meters (m²), as the "(m² per ...)" labels of the
# corresponding production_df columns show.
# Source: https://ourworldindata.org/environmental-impacts-of-food?country=#water-use
resources_data_path = production_data_path + "resources_use" + sep

# Load the three land-use tables (per 1000 kcal, per kg, per 100 g protein)
# and drop the columns that are not needed afterwards
land_use_kcal = pd.read_csv(resources_data_path + "land-use-kcal-poore.csv").drop(columns = ["Code", "Year"])
land_use_kg = pd.read_csv(resources_data_path + "land-use-per-kg-poore.csv").drop(columns = ["Code", "Year"])
land_use_protein = pd.read_csv(resources_data_path + "land-use-protein-poore.csv").drop(columns = ["Code", "Year"])

# Outer-join the three tables on the food name, so a food missing from one
# table is still kept and its gaps show up as NaN
land_use = (
    land_use_kcal
    .merge(land_use_kg, how = "outer", on = "Entity")
    .merge(land_use_protein, how = "outer", on = "Entity")
)

# Columns are relabelled by position: the key first, then kcal, kg and protein in merge order
land_use.columns = ["Entity", "Land use per 1000kcal", "Land use per kg", "Land use per 100g protein"]
land_use.head()

Unnamed: 0,Entity,Land use per 1000kcal,Land use per kg,Land use per 100g protein
0,Apples,1.3125,0.63,21.0
1,Bananas,3.216667,1.93,21.444444
2,Barley,0.222,1.11,
3,Beef (beef herd),119.490842,326.21,163.595787
4,Beef (dairy herd),15.838828,43.24,21.904762


In [6]:
# Freshwater withdrawals are measured in liters (l), as the "(liters per ...)" labels
# of the corresponding production_df columns show.
# Source: https://ourworldindata.org/environmental-impacts-of-food?country=#water-use

# Load the three freshwater tables (per 1000 kcal, per kg, per 100 g protein)
# and drop the columns that are not needed afterwards.
# resources_data_path is the folder already defined for the land-use files.
water_use_kcal = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-kcal.csv").drop(columns = ["Code", "Year"])
water_use_kg = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-kg.csv").drop(columns = ["Code", "Year"])
water_use_protein = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-protein.csv").drop(columns = ["Code", "Year"])

# Outer-join the three tables on the food name, so a food missing from one
# table is still kept and its gaps show up as NaN
water_use = (
    water_use_kcal
    .merge(water_use_kg, how = "outer", on = "Entity")
    .merge(water_use_protein, how = "outer", on = "Entity")
)

# Columns are relabelled by position: the key first, then kcal, kg and protein in merge order.
# NOTE(review): "Freswater withdrawls" is misspelled, but the later merge cell selects
# these exact labels, so they are kept unchanged here; rename in both places together.
water_use.columns = ["Entity", "Freswater withdrawls per 1000kcal", "Freswater withdrawls per kg", "Freswater withdrawls per 100g protein"]
water_use.head()

Unnamed: 0,Entity,Freswater withdrawls per 1000kcal,Freswater withdrawls per kg,Freswater withdrawls per 100g protein
0,Apples,375.208333,180.1,6003.333333
1,Bananas,190.833333,114.5,1272.222222
2,Barley,3.42,17.1,
3,Beef (beef herd),531.575092,1451.2,727.78335
4,Beef (dairy herd),994.249084,2714.3,1375.025329


In [9]:
# Give the main table the same key column name ("Entity") as the resource tables
# and tidy the emissions label before joining
production_df = production_df.rename(columns = {"Food product" : "Entity", "Total_emissions" : "Total emissions"})

# NOTE(review): the join key must match exactly. The previews show "Barley (Beer)" in
# production_df but "Barley" in land_use, so such foods end up as two separate,
# partly-empty rows after the outer join -- confirm and harmonise the names if unintended.
resources = (
    production_df
    # Outer joins keep every food found in any of the three tables
    .merge(land_use, how = "outer", on = "Entity")
    .merge(water_use, how = "outer", on = "Entity")
    # Index the result by food name
    .rename(columns = {"Entity" : "Food"})
    .set_index("Food")
    # Keep only total emissions plus the land-use and freshwater columns
    .loc[:, ["Total emissions", 'Land use per 1000kcal', 'Land use per kg', 'Land use per 100g protein', 'Freswater withdrawls per 1000kcal', 'Freswater withdrawls per kg', 'Freswater withdrawls per 100g protein']]
)

resources.head()

Unnamed: 0_level_0,Total emissions,Land use per 1000kcal,Land use per kg,Land use per 100g protein,Freswater withdrawls per 1000kcal,Freswater withdrawls per kg,Freswater withdrawls per 100g protein
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Wheat & Rye (Bread),1.4,,,,,,
Maize (Meal),1.1,,,,,,
Barley (Beer),1.1,,,,,,
Oatmeal,1.6,2.897446,7.6,5.846154,183.911552,482.4,371.076923
Rice,4.0,0.759631,2.8,3.943662,609.983722,2248.4,3166.760563


TODO: use the `combine_data` function from the EDA `mining_data` file.

# Look up the food names held in the index (the two lists below appear to be copied from these labels)
resources.index
# Foods of plant origin, listed by hand
plant_based = [
    "Wheat & Rye (Bread)", "Maize (Meal)", "Barley (Beer)", "Oatmeal", "Rice",
    "Potatoes", "Cassava",
    "Cane Sugar", "Beet Sugar",
    "Other Pulses", "Peas", "Nuts", "Groundnuts",
    "Soymilk", "Tofu",
    "Soybean Oil", "Palm Oil", "Sunflower Oil", "Rapeseed Oil", "Olive Oil",
    "Tomatoes", "Onions & Leeks", "Root Vegetables", "Brassicas", "Other Vegetables",
    "Citrus Fruit", "Bananas", "Apples", "Berries & Grapes",
    "Wine", "Other Fruit", "Coffee", "Dark Chocolate",
]

# Foods of animal origin, listed by hand
animal_based = [
    "Beef (beef herd)", "Beef (dairy herd)", "Lamb & Mutton", "Pig Meat", "Poultry Meat",
    "Milk", "Cheese", "Eggs",
    "Fish (farmed)", "Shrimps (farmed)",
]

In [25]:
# Save the merged table (indexed by food name) as resources.csv in the same
# folder the production data was read from
resources.to_csv(production_data_path + "resources.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, Wheat & Rye (Bread) to Grains
Data columns (total 7 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Total emissions                        43 non-null     float64
 1   Land use per 1000kcal                  38 non-null     float64
 2   Land use per kg                        43 non-null     float64
 3   Land use per 100g protein              32 non-null     float64
 4   Freswater withdrawls per 1000kcal      35 non-null     float64
 5   Freswater withdrawls per kg            43 non-null     float64
 6   Freswater withdrawls per 100g protein  30 non-null     float64
dtypes: float64(7)
memory usage: 4.1+ KB
