In [1]:
# required packages
from pathlib import Path

import numpy as np
import pandas as pd

In [8]:
# Loading data
# NOTE(review): the location below is an absolute path on one machine, so this
# cell only runs where that exact file exists — consider a configurable data dir.
raw_csv_path = '/Users/gurjitsingh/Desktop/MS Data Science/MS_Project_Python/raw_datasets/Cropland_Nutrient_Balance/Cropland_nutrient_balance_All_Data_(Normalized)/Environment_Cropland_nutrient_budget_E_All_Data_(Normalized).csv'
raw_data = pd.read_csv(raw_csv_path)
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,5087,Mineral fertilizers,7275,Cropland nitrogen,1961,1961,t,1000.0,E,
1,2,'004,Afghanistan,5087,Mineral fertilizers,7275,Cropland nitrogen,1962,1962,t,1000.0,E,
2,2,'004,Afghanistan,5087,Mineral fertilizers,7275,Cropland nitrogen,1963,1963,t,1000.0,E,
3,2,'004,Afghanistan,5087,Mineral fertilizers,7275,Cropland nitrogen,1964,1964,t,550.0,E,
4,2,'004,Afghanistan,5087,Mineral fertilizers,7275,Cropland nitrogen,1965,1965,t,550.0,E,


In [9]:
# List the distinct categories found in the 'Item' column
raw_data['Item'].unique()

array(['Mineral fertilizers', 'Manure applied to Soils',
       'Atmospheric Deposition', 'Crop Removal', 'Biological Fixation',
       'Seed', 'Leaching', 'Volatilisation', 'Input', 'Outputs',
       'Nutrient balance'], dtype=object)

In [10]:
# List the distinct categories found in the 'Element' column
raw_data['Element'].unique()

array(['Cropland nitrogen', 'Cropland nitrogen per unit area',
       'Cropland phosphorus', 'Cropland phosphorus per unit area',
       'Cropland potassium', 'Cropland potassium per unit area',
       'Cropland nitrogen use efficiency',
       'Cropland phosphorus use efficiency',
       'Cropland potassium use efficiency'], dtype=object)

This dataset contains a complex set of information on nutrient inputs to, and losses 
from, cropland. These characteristics are sub-divided into smaller components.
For example, nutrient inputs are reported in several forms, such as 'Mineral fertilizers', 
'Manure applied to Soils' and 'Atmospheric Deposition', whereas nutrient outputs 
are reported in forms such as 'Crop Removal', 'Seed', 'Leaching' and 
'Volatilisation'. 

However, the most important item in the "Item" column that is relevant to our 
purpose is "Nutrient balance". It tracks the net nutrient gain/loss on cropland, 
either in total or per unit area. 

Nutrient Balance = (Total nutrient input - Total nutrient output)

So, we will filter the dataset to keep only the rows corresponding to "Nutrient balance".

In [11]:
# Keep only the rows whose 'Item' is exactly 'Nutrient balance'
is_nutrient_balance = raw_data['Item'].eq('Nutrient balance')
filtered_data = raw_data.loc[is_nutrient_balance]
filtered_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
2728,2,'004,Afghanistan,5081,Nutrient balance,7275,Cropland nitrogen,1961,1961,t,21205.8263,E,
2729,2,'004,Afghanistan,5081,Nutrient balance,7275,Cropland nitrogen,1962,1962,t,21129.5929,E,
2730,2,'004,Afghanistan,5081,Nutrient balance,7275,Cropland nitrogen,1963,1963,t,28253.8766,E,
2731,2,'004,Afghanistan,5081,Nutrient balance,7275,Cropland nitrogen,1964,1964,t,24993.8707,E,
2732,2,'004,Afghanistan,5081,Nutrient balance,7275,Cropland nitrogen,1965,1965,t,26293.3111,E,


In [12]:
# Distinct 'Element' values remaining after keeping only the 'Nutrient balance' rows
filtered_data['Element'].unique()

array(['Cropland nitrogen', 'Cropland nitrogen per unit area',
       'Cropland phosphorus', 'Cropland phosphorus per unit area',
       'Cropland potassium', 'Cropland potassium per unit area',
       'Cropland nitrogen use efficiency',
       'Cropland phosphorus use efficiency',
       'Cropland potassium use efficiency'], dtype=object)

In [13]:
# Number of rows available for each 'Element' value in the filtered data
filtered_data['Element'].value_counts()

Element
Cropland nitrogen                     13181
Cropland nitrogen per unit area       13181
Cropland phosphorus                   13181
Cropland phosphorus per unit area     13181
Cropland potassium                    13181
Cropland potassium per unit area      13181
Cropland nitrogen use efficiency      13181
Cropland phosphorus use efficiency    13181
Cropland potassium use efficiency     13181
Name: count, dtype: int64

Now, the "Element" column has several categories. Of the total nutrient balance 
and the nutrient balance per unit area, we will consider only the latter, i.e., 
nutrient balance per unit area. The other important elements are those related to 
cropland nutrient use efficiency, which is the ratio of nutrient output to nutrient input.

So, we will further filter our dataset for desired data.

In [14]:
# Elements retained for the analysis: the per-unit-area figures and the
# use-efficiency figures for nitrogen, phosphorus and potassium.
elements_to_keep = [
    'Cropland nitrogen per unit area',
    'Cropland phosphorus per unit area',
    'Cropland potassium per unit area',
    'Cropland nitrogen use efficiency',
    'Cropland phosphorus use efficiency',
    'Cropland potassium use efficiency',
]

# Boolean mask: True where the row's 'Element' is one of the retained labels
keep_mask = filtered_data['Element'].isin(elements_to_keep)
filtered_data_2 = filtered_data.loc[keep_mask]
filtered_data_2.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
2790,2,'004,Afghanistan,5081,Nutrient balance,7276,Cropland nitrogen per unit area,1961,1961,kg/ha,2.7362,E,
2791,2,'004,Afghanistan,5081,Nutrient balance,7276,Cropland nitrogen per unit area,1962,1962,kg/ha,2.7089,E,
2792,2,'004,Afghanistan,5081,Nutrient balance,7276,Cropland nitrogen per unit area,1963,1963,kg/ha,3.5992,E,
2793,2,'004,Afghanistan,5081,Nutrient balance,7276,Cropland nitrogen per unit area,1964,1964,kg/ha,3.1618,E,
2794,2,'004,Afghanistan,5081,Nutrient balance,7276,Cropland nitrogen per unit area,1965,1965,kg/ha,3.3241,E,


In [15]:
# Confirm which 'Element' values are present after the second filter
filtered_data_2['Element'].unique()

array(['Cropland nitrogen per unit area',
       'Cropland phosphorus per unit area',
       'Cropland potassium per unit area',
       'Cropland nitrogen use efficiency',
       'Cropland phosphorus use efficiency',
       'Cropland potassium use efficiency'], dtype=object)

In [24]:
# Determining the unit of measurement reported for each element.
# The distinct (Element, Unit) pairs are collected in a single vectorised step
# rather than a row-by-row Python loop. Rows keep their order of first
# appearance, and an element reported in more than one unit shows up as
# several rows instead of being silently overwritten by the last unit seen.
units_df = (
    filtered_data_2[['Element', 'Unit']]
    .drop_duplicates()
    .reset_index(drop=True)   # fresh 0..n-1 row labels for display
)
units_df

Unnamed: 0,Element,Unit
0,Cropland nitrogen per unit area,kg/ha
1,Cropland phosphorus per unit area,kg/ha
2,Cropland potassium per unit area,kg/ha
3,Cropland nitrogen use efficiency,%
4,Cropland phosphorus use efficiency,%
5,Cropland potassium use efficiency,%


In [16]:
# Restructuring data from long-format to wide-format
pivoted_data = filtered_data_2.pivot_table(
    index = ['Area Code', 'Area', 'Year Code', 'Year'],
    columns = 'Element',
    values = 'Value'
)

# Resetting row index
pivoted_data.reset_index(inplace=True)

# Setting column index name to None
pivoted_data.columns.name = None

pivoted_data.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Cropland nitrogen per unit area,Cropland nitrogen use efficiency,Cropland phosphorus per unit area,Cropland phosphorus use efficiency,Cropland potassium per unit area,Cropland potassium use efficiency
0,1,Armenia,1992,1992,39.3821,31.7868,4.3315,45.4498,6.576,64.8066
1,1,Armenia,1993,1993,33.5037,35.335,2.3616,60.4404,2.738,81.0745
2,1,Armenia,1994,1994,19.3473,45.1713,2.1764,59.3714,1.3217,90.1233
3,1,Armenia,1995,1995,19.879,45.5678,1.5805,67.906,0.2278,98.1816
4,1,Armenia,1996,1996,19.9442,49.0195,0.5946,86.5197,-1.1704,110.1389


In [18]:
# Column dtypes and non-null counts of the wide-format table
pivoted_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13181 entries, 0 to 13180
Data columns (total 10 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Area Code                           13181 non-null  int64  
 1   Area                                13181 non-null  object 
 2   Year Code                           13181 non-null  int64  
 3   Year                                13181 non-null  int64  
 4   Cropland nitrogen per unit area     13181 non-null  float64
 5   Cropland nitrogen use efficiency    13181 non-null  float64
 6   Cropland phosphorus per unit area   13181 non-null  float64
 7   Cropland phosphorus use efficiency  13181 non-null  float64
 8   Cropland potassium per unit area    13181 non-null  float64
 9   Cropland potassium use efficiency   13181 non-null  float64
dtypes: float64(6), int64(3), object(1)
memory usage: 1.0+ MB


In [17]:
# Renaming columns
cleaned_data = pivoted_data.rename(
    columns = {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Cropland nitrogen per unit area': 'NB_N2_per_unit_cropland_area',
        'Cropland nitrogen use efficiency': 'cropland_N2_use_efficiency',
        'Cropland phosphorus per unit area': 'NB_P2O5_per_unit_cropland_area',
        'Cropland phosphorus use efficiency': 'cropland_P2O5_use_efficiency',
        'Cropland potassium per unit area': 'NB_K2O_per_unit_cropland_area',
        'Cropland potassium use efficiency': 'cropland_K2O_use_efficiency'
    }
)
cleaned_data.head()

Unnamed: 0,area_code,area,year_code,year,NB_N2_per_unit_cropland_area,cropland_N2_use_efficiency,NB_P2O5_per_unit_cropland_area,cropland_P2O5_use_efficiency,NB_K2O_per_unit_cropland_area,cropland_K2O_use_efficiency
0,1,Armenia,1992,1992,39.3821,31.7868,4.3315,45.4498,6.576,64.8066
1,1,Armenia,1993,1993,33.5037,35.335,2.3616,60.4404,2.738,81.0745
2,1,Armenia,1994,1994,19.3473,45.1713,2.1764,59.3714,1.3217,90.1233
3,1,Armenia,1995,1995,19.879,45.5678,1.5805,67.906,0.2278,98.1816
4,1,Armenia,1996,1996,19.9442,49.0195,0.5946,86.5197,-1.1704,110.1389


In [18]:
# Summary statistics for the nitrogen per-unit-area column
cleaned_data['NB_N2_per_unit_cropland_area'].describe()

count    13181.000000
mean        88.279620
std        572.929663
min       -566.637900
25%          5.698100
50%         22.662900
75%         61.978600
max      13604.227000
Name: NB_N2_per_unit_cropland_area, dtype: float64

In [19]:
# Summary statistics for the phosphorus per-unit-area column
cleaned_data['NB_P2O5_per_unit_cropland_area'].describe()

count    13181.000000
mean        10.760125
std         36.866112
min        -54.782100
25%         -0.556700
50%          2.531600
75%          9.591400
max        656.060000
Name: NB_P2O5_per_unit_cropland_area, dtype: float64

In [25]:
# exporting cleaned data as csv file
output_path = Path('cleaned_datasets/cropland_nutrient_balance_cleaned.csv')
# to_csv does not create missing folders, so make sure the target directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
cleaned_data.to_csv(output_path, index=False)