In [1]:
import numpy as np
import pandas as pd

import sys, os

# Helpers
abspath = os.path.abspath
dirname = os.path.dirname
sep = os.sep

# Update sys.path for in-house libraries
folder_ = dirname(abspath(os.getcwd()))
for i in range(1): folder_ = dirname(folder_)
sys.path.append(folder_)

# In-house libraries
import src.utils.mining_data_tb as md
import src.utils.folder_tb as fo

In [2]:
# Path to data
production_data_path = fo.path_to_folder(2, "data" + sep + "environment")
# Load data
production_df = pd.read_csv(production_data_path + "food_production.csv")
production_df.head(2)

Unnamed: 0,Food product,Land use change,Animal Feed,Farm,Processing,Transport,Packging,Retail,Total_emissions,Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal),...,Freshwater withdrawals per 100g protein (liters per 100g protein),Freshwater withdrawals per kilogram (liters per kilogram),Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal),Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein),Land use per 1000kcal (m² per 1000kcal),Land use per kilogram (m² per kilogram),Land use per 100g protein (m² per 100g protein),Scarcity-weighted water use per kilogram (liters per kilogram),Scarcity-weighted water use per 100g protein (liters per 100g protein),Scarcity-weighted water use per 1000kcal (liters per 1000 kilocalories)
0,Wheat & Rye (Bread),0.1,0.0,0.8,0.2,0.1,0.1,0.1,1.4,,...,,,,,,,,,,
1,Maize (Meal),0.3,0.0,0.5,0.1,0.1,0.1,0.0,1.1,,...,,,,,,,,,,


In [3]:
production_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 23 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Food product                                                             43 non-null     object 
 1   Land use change                                                          43 non-null     float64
 2   Animal Feed                                                              43 non-null     float64
 3   Farm                                                                     43 non-null     float64
 4   Processing                                                               43 non-null     float64
 5   Transport                                                                43 non-null     float64
 6   Packging                                                                 43 

In [4]:
# As we have some missing values in the data, we will try to get this information from other files

In [5]:
# The units are liters (l): https://ourworldindata.org/environmental-impacts-of-food?country=#water-use
resources_data_path = production_data_path + "resources_use" + sep

# Load data and drop the unnecessary columns
land_use_kcal = pd.read_csv(resources_data_path + "land-use-kcal-poore.csv").drop(["Code", "Year"], axis = 1)
land_use_kg = pd.read_csv(resources_data_path + "land-use-per-kg-poore.csv").drop(["Code", "Year"], axis = 1)
land_use_protein = pd.read_csv(resources_data_path + "land-use-protein-poore.csv").drop(["Code", "Year"], axis = 1)

# Merge all the data in one dataframe
land_use = pd.merge(land_use_kcal, land_use_kg, how = "outer", on = "Entity")
land_use = pd.merge(land_use, land_use_protein, how = "outer", on = "Entity")
land_use.columns = ["Entity", "Land use per 1000kcal", "Land use per kg", "Land use per 100g protein"]
land_use.head()

Unnamed: 0,Entity,Land use per 1000kcal,Land use per kg,Land use per 100g protein
0,Apples,1.3125,0.63,21.0
1,Bananas,3.216667,1.93,21.444444
2,Barley,0.222,1.11,
3,Beef (beef herd),119.490842,326.21,163.595787
4,Beef (dairy herd),15.838828,43.24,21.904762


In [6]:
# The units are squared meters (m2): https://ourworldindata.org/environmental-impacts-of-food?country=#water-use

# Load data and drop the unnecessary columns
# We use the same path as before
water_use_kcal = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-kcal.csv").drop(["Code", "Year"], axis = 1)
water_use_kg = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-kg.csv").drop(["Code", "Year"], axis = 1)
water_use_protein = pd.read_csv(resources_data_path + "freshwater-withdrawals-per-protein.csv").drop(["Code", "Year"], axis = 1)

# Merge all the data in one dataframe
water_use = pd.merge(water_use_kcal, water_use_kg, how = "outer", on = "Entity")
water_use = pd.merge(water_use, water_use_protein, how = "outer", on = "Entity")
water_use.columns = ["Entity", "Freswater withdrawls per 1000kcal", "Freswater withdrawls per kg", "Freswater withdrawls per 100g protein"]
water_use.head()

Unnamed: 0,Entity,Freswater withdrawls per 1000kcal,Freswater withdrawls per kg,Freswater withdrawls per 100g protein
0,Apples,375.208333,180.1,6003.333333
1,Bananas,190.833333,114.5,1272.222222
2,Barley,3.42,17.1,
3,Beef (beef herd),531.575092,1451.2,727.78335
4,Beef (dairy herd),994.249084,2714.3,1375.025329


In [7]:
# We clean up a little bit the main df before merging
production_df = production_df.rename({"Food product" : "Entity", "Total_emissions" : "Total emissions"}, axis = 1)

# Now we merge everything together
resources = pd.merge(production_df, land_use, how = "outer", on = "Entity")
resources = pd.merge(resources, water_use, how = "outer", on = "Entity")

# Some cleaning and transformation
resources = resources.rename({"Entity" : "Food"}, axis = 1)
resources = resources.set_index("Food")
resources = resources.loc[:, ["Total emissions", 'Land use per 1000kcal', 'Land use per kg', 'Land use per 100g protein', 'Freswater withdrawls per 1000kcal', 'Freswater withdrawls per kg', 'Freswater withdrawls per 100g protein']]

resources.head()

Unnamed: 0_level_0,Total emissions,Land use per 1000kcal,Land use per kg,Land use per 100g protein,Freswater withdrawls per 1000kcal,Freswater withdrawls per kg,Freswater withdrawls per 100g protein
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Wheat & Rye (Bread),1.4,,,,,,
Maize (Meal),1.1,,,,,,
Barley (Beer),1.1,,,,,,
Oatmeal,1.6,2.897446,7.6,5.846154,183.911552,482.4,371.076923
Rice,4.0,0.759631,2.8,3.943662,609.983722,2248.4,3166.760563


In [8]:
def combine_data(column1, column2, df):
    '''
    This function combines two foods' values in the resources data. For instancem "Tofu" and "Tofu (soybeans)", as they are the same food, and one has the missing values of the other.

    args :
    column1 -> Should be the name of the food1 in the dataframe
    column2 -> Should be the name of the food2 in the dataframe
    df -> dataframe we pull the data from according to the given names
    '''
    # To store the new values of combining both columns
    new_values = []

    # Iterate through the length of the column1 (both columns should have the same length)
    for i in range(len(df.loc[column1])):
        # If column1 is nan, return the value of the other column
        if np.isnan(df.loc[column1][i]):
            new_values.append(df.loc[column2][i])
        # else, keep the one from column 1
        else:
            new_values.append(df.loc[column1][i])

    # Join the values together with an index (should be the same for both columns)
    # and transpose it
    df = pd.DataFrame(new_values, index = df.loc[column1].index, columns = [column1 + "_"])
    return df.T

In [9]:
tofu = combine_data("Tofu", "Tofu (soybeans)", resources)
wheat = combine_data("Wheat & Rye", "Wheat & Rye (Bread)", resources)
maize = combine_data("Maize", "Maize (Meal)", resources)
barley = combine_data("Barley", "Barley (Beer)", resources)

resources = resources.append([tofu, wheat, maize, barley])
resources = resources.drop(["Tofu", "Tofu (soybeans)",
            "Wheat & Rye", "Wheat & Rye (Bread)",
            "Maize", "Maize (Meal)",
            "Barley", "Barley (Beer)",])

resources = resources.rename({'Tofu_': "Tofu", 'Wheat & Rye_': "Wheat & Rye", 'Maize_': "Maize",
       'Barley_': "Barley"})
resources.index

Index(['Oatmeal', 'Rice', 'Potatoes', 'Cassava', 'Cane Sugar', 'Beet Sugar',
       'Other Pulses', 'Peas', 'Nuts', 'Groundnuts', 'Soymilk', 'Soybean Oil',
       'Palm Oil', 'Sunflower Oil', 'Rapeseed Oil', 'Olive Oil', 'Tomatoes',
       'Onions & Leeks', 'Root Vegetables', 'Brassicas', 'Other Vegetables',
       'Citrus Fruit', 'Bananas', 'Apples', 'Berries & Grapes', 'Wine',
       'Other Fruit', 'Coffee', 'Dark Chocolate', 'Beef (beef herd)',
       'Beef (dairy herd)', 'Lamb & Mutton', 'Pig Meat', 'Poultry Meat',
       'Milk', 'Cheese', 'Eggs', 'Fish (farmed)', 'Shrimps (farmed)',
       'Prawns (farmed)', 'Grains', 'Tofu', 'Wheat & Rye', 'Maize', 'Barley'],
      dtype='object')

In [10]:
# Separate food products into two categories: plant-based and animal-based
plant_based = ['Oatmeal', 'Rice', 'Potatoes', 'Cassava', 'Cane Sugar', 'Beet Sugar',
       'Other Pulses', 'Peas', 'Nuts', 'Groundnuts', 'Soymilk', 'Soybean Oil',
       'Palm Oil', 'Sunflower Oil', 'Rapeseed Oil', 'Olive Oil', 'Tomatoes',
       'Onions & Leeks', 'Root Vegetables', 'Brassicas', 'Other Vegetables',
       'Citrus Fruit', 'Bananas', 'Apples', 'Berries & Grapes', 'Wine',
       'Other Fruit', 'Coffee', 'Dark Chocolate', 'Tofu', 'Wheat & Rye', 'Maize', 'Barley']

animal_based = ['Beef (beef herd)', 'Beef (dairy herd)', 'Lamb & Mutton', 'Pig Meat',
       'Poultry Meat', 'Milk', 'Cheese', 'Eggs', 'Fish (farmed)',
       'Shrimps (farmed)']

# Create a new column: origin
resources["Origin"] = None

# Replace the Nones in the new "Origin" column for the corresponding values, depending on the procedence
resources.loc[plant_based, "Origin"] = "Plant-based"
resources.loc[animal_based, "Origin"] = "Animal-based"

In [11]:
resources

Unnamed: 0,Total emissions,Land use per 1000kcal,Land use per kg,Land use per 100g protein,Freswater withdrawls per 1000kcal,Freswater withdrawls per kg,Freswater withdrawls per 100g protein,Origin
Oatmeal,1.6,2.897446,7.6,5.846154,183.911552,482.4,371.076923,Plant-based
Rice,4.0,0.759631,2.8,3.943662,609.983722,2248.4,3166.760563,Plant-based
Potatoes,0.3,1.202186,0.88,5.176471,80.737705,59.1,347.647059,Plant-based
Cassava,0.9,1.858316,1.81,20.111111,,0.0,,Plant-based
Cane Sugar,2.6,0.581197,2.04,,176.666667,620.1,,Plant-based
Beet Sugar,1.4,0.521368,1.83,,62.022792,217.7,,Plant-based
Other Pulses,1.6,4.565982,15.57,7.272303,,435.7,203.503036,Plant-based
Peas,0.8,2.156069,7.46,3.357336,,396.6,178.487849,Plant-based
Nuts,0.2,2.107317,12.96,7.936314,672.162602,4133.8,2531.414574,Plant-based
Groundnuts,2.4,1.57069,9.11,3.479756,319.362069,1852.3,707.524828,Plant-based


In [12]:
# Let's save the cleaned dataframe
#resources.to_csv(production_data_path + "resources.csv")

In [22]:
def color_mapper(df):
    color_map = {}

    for ind, row in df.iterrows():
        if row["Origin"] == "Plant-based":
            color_map[ind] = "blue"
        else:
            color_map[ind] = "red"

    return color_map

In [23]:
color_mapper(resources)

{'Oatmeal': 'blue',
 'Rice': 'blue',
 'Potatoes': 'blue',
 'Cassava': 'blue',
 'Cane Sugar': 'blue',
 'Beet Sugar': 'blue',
 'Other Pulses': 'blue',
 'Peas': 'blue',
 'Nuts': 'blue',
 'Groundnuts': 'blue',
 'Soymilk': 'blue',
 'Soybean Oil': 'blue',
 'Palm Oil': 'blue',
 'Sunflower Oil': 'blue',
 'Rapeseed Oil': 'blue',
 'Olive Oil': 'blue',
 'Tomatoes': 'blue',
 'Onions & Leeks': 'blue',
 'Root Vegetables': 'blue',
 'Brassicas': 'blue',
 'Other Vegetables': 'blue',
 'Citrus Fruit': 'blue',
 'Bananas': 'blue',
 'Apples': 'blue',
 'Berries & Grapes': 'blue',
 'Wine': 'blue',
 'Other Fruit': 'blue',
 'Coffee': 'blue',
 'Dark Chocolate': 'blue',
 'Beef (beef herd)': 'red',
 'Beef (dairy herd)': 'red',
 'Lamb & Mutton': 'red',
 'Pig Meat': 'red',
 'Poultry Meat': 'red',
 'Milk': 'red',
 'Cheese': 'red',
 'Eggs': 'red',
 'Fish (farmed)': 'red',
 'Shrimps (farmed)': 'red',
 'Prawns (farmed)': 'red',
 'Grains': 'red',
 'Tofu': 'blue',
 'Wheat & Rye': 'blue',
 'Maize': 'blue',
 'Barley': 'blue