# Milestone 2

In this notebook we preprocess the datasets and keep only the information we need.

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Total agriculture emissions
First we will focus on the `Emissions_Agriculture_Agriculture_total_E_All_Data_(Norm)` dataset and try to see which product emits the most GHG (in terms of CO2eq).

In [2]:
# Load the total agriculture emissions dataset (the file is read as latin-1)
emissions_csv = '../data/Emissions_Agriculture_Agriculture_total_E_All_Data_(Norm).csv'
agriculture_emissions = pd.read_csv(emissions_csv, encoding='latin-1')
print(agriculture_emissions.shape)
agriculture_emissions.head()

(326182, 11)


Unnamed: 0,Country Code,Country,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,5058,Enteric Fermentation,7231,Emissions (CO2eq),1961,1961,Gigagrams,5054.3459,A
1,2,Afghanistan,5058,Enteric Fermentation,7231,Emissions (CO2eq),1962,1962,Gigagrams,5151.5228,A
2,2,Afghanistan,5058,Enteric Fermentation,7231,Emissions (CO2eq),1963,1963,Gigagrams,5372.3989,A
3,2,Afghanistan,5058,Enteric Fermentation,7231,Emissions (CO2eq),1964,1964,Gigagrams,5440.365,A
4,2,Afghanistan,5058,Enteric Fermentation,7231,Emissions (CO2eq),1965,1965,Gigagrams,5577.558,A


In [3]:
# Keep only the rows whose Element is 'Emissions (CO2eq)'
is_co2eq = agriculture_emissions['Element'] == 'Emissions (CO2eq)'
agriculture_emissions = agriculture_emissions[is_co2eq]
# The code columns, Unit and Element are no longer needed after the filter
unneeded_columns = ['Country Code', 'Item Code', 'Element Code', 'Year Code', 'Unit', 'Element']
agriculture_emissions = agriculture_emissions.drop(columns=unneeded_columns)

In [4]:
# Inspect the filtered dataset: shape and first rows
print(agriculture_emissions.shape)
agriculture_emissions.head()

(139965, 5)


Unnamed: 0,Country,Item,Year,Value,Flag
0,Afghanistan,Enteric Fermentation,1961,5054.3459,A
1,Afghanistan,Enteric Fermentation,1962,5151.5228,A
2,Afghanistan,Enteric Fermentation,1963,5372.3989,A
3,Afghanistan,Enteric Fermentation,1964,5440.365,A
4,Afghanistan,Enteric Fermentation,1965,5577.558,A


In [5]:
# Count how often each Flag value occurs
agriculture_emissions['Flag'].value_counts()

A     123645
Fc      8919
EA      7401
Name: Flag, dtype: int64

The flags represent : 
* `A` : Aggregate, may include official, semi-official, estimated or calculated data
* `Fc` : Calculated data
* `EA` : Emissions computed using data from Fertilizer Archive dataset

All of these flags are acceptable, so we can drop this column.

In [6]:
# Every flag value is acceptable, so the Flag column carries nothing we need to keep
agriculture_emissions = agriculture_emissions.drop(columns=['Flag'])

In [7]:
# Count the missing values in each column
agriculture_emissions.isna().sum()

Country    0
Item       0
Year       0
Value      0
dtype: int64

In [8]:
# Remove aggregate items so that only individual emission items remain
agg_item = ['Agriculture total', 'Agricultural Soils']
# A boolean mask filters the rows directly; no round trip through index labels is needed
agriculture_emissions = agriculture_emissions[~agriculture_emissions['Item'].isin(agg_item)]

# Remove aggregate "countries": the world, regions and country groups
# (compare this list against agriculture_emissions.Country.unique() when the data changes)
agg_country = ['World', 'Africa',
       'Eastern Africa', 'Middle Africa', 'Northern Africa',
       'Southern Africa', 'Western Africa', 'Americas',
       'Northern America', 'Central America', 'Caribbean',
       'South America', 'Asia', 'Central Asia', 'Eastern Asia',
       'Southern Asia', 'South-Eastern Asia', 'Western Asia', 'Europe',
       'Eastern Europe', 'Northern Europe', 'Southern Europe',
       'Western Europe', 'Oceania', 'Australia & New Zealand',
       'Melanesia', 'Micronesia', 'Polynesia', 'European Union',
       'Least Developed Countries', 'Land Locked Developing Countries',
       'Small Island Developing States',
       'Low Income Food Deficit Countries',
       'Net Food Importing Developing Countries', 'Annex I countries',
       'Non-Annex I countries', 'OECD']
agriculture_emissions = agriculture_emissions[~agriculture_emissions['Country'].isin(agg_country)]

The null check above shows that there are no missing values.

In [9]:
# Reshape to wide format: one row per (Country, Year) and one column per Item
agriculture_emissions = agriculture_emissions.pivot_table(
    values='Value',
    index=['Country', 'Year'],
    columns='Item',
)
agriculture_emissions.head()

Unnamed: 0_level_0,Item,Burning - Crop residues,Burning - Savanna,Crop Residues,Cultivation of Organic Soils,Enteric Fermentation,Manure Management,Manure applied to Soils,Manure left on Pasture,Rice Cultivation,Synthetic Fertilizers
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,1961,118.2797,,332.7512,,5054.3459,367.831,390.4421,2368.1734,617.4,6.4546
Afghanistan,1962,121.7607,,335.2904,,5151.5228,376.4362,396.0043,2326.1498,617.4,6.4546
Afghanistan,1963,121.7607,,312.7451,,5372.3989,392.5811,406.7487,2390.2709,617.4,6.4546
Afghanistan,1964,122.7093,,337.2814,,5440.365,399.8526,415.1598,2422.8848,646.8,6.4546
Afghanistan,1965,122.3902,,340.9305,,5577.558,413.3914,427.7566,2482.0286,646.8,6.4546


In [10]:
# Assumption: when there is no data, the country does not produce any emission of that type
agriculture_emissions = agriculture_emissions.fillna(0)

In [11]:
# Create a new column with the total over all emission items.
# An existing 'Sum' column is excluded first, so re-running this cell
# does not add the previous total into the new one.
item_columns = agriculture_emissions.columns.drop('Sum', errors='ignore')
agriculture_emissions['Sum'] = agriculture_emissions[item_columns].sum(axis=1)
agriculture_emissions.head()

Unnamed: 0_level_0,Item,Burning - Crop residues,Burning - Savanna,Crop Residues,Cultivation of Organic Soils,Enteric Fermentation,Manure Management,Manure applied to Soils,Manure left on Pasture,Rice Cultivation,Synthetic Fertilizers,Sum
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,1961,118.2797,0.0,332.7512,0.0,5054.3459,367.831,390.4421,2368.1734,617.4,6.4546,9255.6779
Afghanistan,1962,121.7607,0.0,335.2904,0.0,5151.5228,376.4362,396.0043,2326.1498,617.4,6.4546,9331.0188
Afghanistan,1963,121.7607,0.0,312.7451,0.0,5372.3989,392.5811,406.7487,2390.2709,617.4,6.4546,9620.36
Afghanistan,1964,122.7093,0.0,337.2814,0.0,5440.365,399.8526,415.1598,2422.8848,646.8,6.4546,9791.5075
Afghanistan,1965,122.3902,0.0,340.9305,0.0,5577.558,413.3914,427.7566,2482.0286,646.8,6.4546,10017.3099


In [14]:
# Build a dataframe of relative numbers: every column (including 'Sum')
# expressed as a percentage of the row's 'Sum', with '% ' prepended to its name
relative_agriculture_emission = (
    agriculture_emissions
    .div(agriculture_emissions['Sum'], axis=0)
    .mul(100)
    .add_prefix('% ')
    .rename_axis(columns=None)  # the loop-built frame had no name on its column axis
)

relative_agriculture_emission.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,% Burning - Crop residues,% Burning - Savanna,% Crop Residues,% Cultivation of Organic Soils,% Enteric Fermentation,% Manure Management,% Manure applied to Soils,% Manure left on Pasture,% Rice Cultivation,% Synthetic Fertilizers,% Sum
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,1961,1.277915,0.0,3.595103,0.0,54.608057,3.974112,4.218406,25.586169,6.6705,0.069737,100.0
Afghanistan,1962,1.304903,0.0,3.593288,0.0,55.208578,4.034245,4.243956,24.929216,6.616641,0.069174,100.0
Afghanistan,1963,1.265656,0.0,3.250867,0.0,55.844053,4.080732,4.227999,24.845961,6.417639,0.067093,100.0
Afghanistan,1964,1.253222,0.0,3.444632,0.0,55.562078,4.083667,4.239999,24.744758,6.605724,0.06592,100.0
Afghanistan,1965,1.221787,0.0,3.403414,0.0,55.6792,4.126771,4.270174,24.777397,6.456823,0.064434,100.0


In [15]:
# Save the datasets
output_dir = Path('../generated')
# to_csv does not create missing directories, so make sure the target folder exists
output_dir.mkdir(parents=True, exist_ok=True)
agriculture_emissions.to_csv(output_dir / 'agriculture_emissions_total.csv')
relative_agriculture_emission.to_csv(output_dir / 'agriculture_emissions_total_relative.csv')