In [None]:
# required packages
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Read the normalized pesticides-use CSV into a DataFrame and preview its first rows
pesticides_csv = 'Datasets_MS_Project/Pesticides/Pesticides_Use_E_All_Data_(Normalized)/Inputs_Pesticides_Use_E_All_Data_(Normalized).csv'
raw_data = pd.read_csv(pesticides_csv)
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,3,'008,Albania,1357,Pesticides (total),5157,Agricultural Use,1990,1990,t,121.0,I,Imputed value
1,3,'008,Albania,1357,Pesticides (total),5157,Agricultural Use,1991,1991,t,121.0,I,Imputed value
2,3,'008,Albania,1357,Pesticides (total),5157,Agricultural Use,1992,1992,t,121.0,I,Imputed value
3,3,'008,Albania,1357,Pesticides (total),5157,Agricultural Use,1993,1993,t,121.0,A,Official figure
4,3,'008,Albania,1357,Pesticides (total),5157,Agricultural Use,1994,1994,t,201.0,A,Official figure


In [3]:
# (number of rows, number of columns) of the raw table
raw_data.shape

(100004, 13)

In [4]:
# per-column non-null counts and dtypes, plus the memory footprint, of the raw table
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Area Code        100004 non-null  int64  
 1   Area Code (M49)  100004 non-null  object 
 2   Area             100004 non-null  object 
 3   Item Code        100004 non-null  int64  
 4   Item             100004 non-null  object 
 5   Element Code     100004 non-null  int64  
 6   Element          100004 non-null  object 
 7   Year Code        100004 non-null  int64  
 8   Year             100004 non-null  int64  
 9   Unit             100004 non-null  object 
 10  Value            100004 non-null  float64
 11  Flag             100004 non-null  object 
 12  Note             100004 non-null  object 
dtypes: float64(1), int64(5), object(7)
memory usage: 9.9+ MB


In [None]:
# Drop the columns that are not needed for the rest of the notebook
columns_to_remove = ['Area Code (M49)', 'Element Code', 'Unit', 'Flag', 'Note']
filtered_data_1 = raw_data.drop(columns=columns_to_remove)
filtered_data_1.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element,Year Code,Year,Value
0,3,Albania,1357,Pesticides (total),Agricultural Use,1990,1990,121.0
1,3,Albania,1357,Pesticides (total),Agricultural Use,1991,1991,121.0
2,3,Albania,1357,Pesticides (total),Agricultural Use,1992,1992,121.0
3,3,Albania,1357,Pesticides (total),Agricultural Use,1993,1993,121.0
4,3,Albania,1357,Pesticides (total),Agricultural Use,1994,1994,201.0


In [6]:
# distinct values of the 'Item' column
pd.unique(filtered_data_1['Item'])

array(['Pesticides (total)', 'Insecticides',
       'Insecticides – Chlorinated Hydrocarbons',
       'Insecticides – Organo-phosphates', 'Insecticides – Carbamates',
       'Insecticides – Pyrethroids', 'Insecticides - nes', 'Herbicides',
       'Herbicides – Phenoxy hormone products', 'Herbicides – Triazines',
       'Herbicides – Amides', 'Herbicides – Carbamates',
       'Herbicides – Dinitroanilines', 'Herbicides – Sulfonyl ureas',
       'Herbicides – Bipiridils', 'Herbicides - nes',
       'Fungicides and Bactericides', 'Fung & Bact – Inorganics',
       'Fung & Bact – Dithiocarbamates', 'Fung & Bact – Benzimidazoles',
       'Fung & Bact – Triazoles, diazoles', 'Fung & Bact - nes',
       'Plant Growth Regulators', 'Rodenticides',
       'Rodenticides – Anti-coagulants', 'Rodenticides – Other',
       'Other Pesticides nes', 'Insecticides - Biopesticides',
       'Herbicides – Urea derivates', 'Herbicides – Uracil',
       'Fung & Bact – Diazines, morpholines',
       'Rodentic

Since the analysis revolves mostly around producer prices, we will consider only
the total pesticide use from the list of items. We are not doing a
pesticide-specific analysis, and total pesticide use is the most relevant
measure when we are trying to predict producer prices.

In [7]:
# distinct values of the 'Element' column
pd.unique(filtered_data_1['Element'])

array(['Agricultural Use', 'Use per area of cropland', 'Use per capita',
       'Use per value of agricultural production'], dtype=object)

For our analysis, we will consider total pesticide use for all four elements:
agricultural use, use per area of cropland, use per capita, and use per value of agricultural production.

In [8]:
# preview the rows whose Element is 'Use per value of agricultural production'
raw_data[raw_data['Element'].eq('Use per value of agricultural production')].head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
99,3,'008,Albania,1357,Pesticides (total),5173,Use per value of agricultural production,1990,1990,g/Int$,0.12,E,Estimated Value
100,3,'008,Albania,1357,Pesticides (total),5173,Use per value of agricultural production,1993,1993,g/Int$,0.11,E,Estimated Value
101,3,'008,Albania,1357,Pesticides (total),5173,Use per value of agricultural production,1994,1994,g/Int$,0.17,E,Estimated Value
102,3,'008,Albania,1357,Pesticides (total),5173,Use per value of agricultural production,1995,1995,g/Int$,0.2,E,Estimated Value
103,3,'008,Albania,1357,Pesticides (total),5173,Use per value of agricultural production,1996,1996,g/Int$,0.2,E,Estimated Value


In [24]:
# Element/Unit columns only -- still one row per raw record, not yet de-duplicated
unique_units = raw_data.loc[:, ['Element', 'Unit']]
unique_units.head()

Unnamed: 0,Element,Unit
0,Agricultural Use,t
1,Agricultural Use,t
2,Agricultural Use,t
3,Agricultural Use,t
4,Agricultural Use,t


In [None]:
# finding units for different elements
# De-duplicate the (Element, Unit) combinations with one vectorized call instead of
# looping over every raw row with label-based scalar lookups: this is much faster,
# does not rely on the frame having a default 0..n-1 index, and would also reveal
# an element that is reported in more than one unit.
element_unit_pairs = unique_units.drop_duplicates().reset_index(drop=True)
# Element -> Unit lookup (if an element had several units, the last one wins)
pairs = dict(zip(element_unit_pairs['Element'], element_unit_pairs['Unit']))
print(element_unit_pairs)

                                    Element    Unit
0                          Agricultural Use       t
1                  Use per area of cropland   kg/ha
2                            Use per capita  kg/cap
3  Use per value of agricultural production  g/Int$


As we can see, each element is reported in a different unit, so the four measures are on different scales.

In [9]:
# keep only the rows for the 'Pesticides (total)' item
filtered_data_2 = filtered_data_1[filtered_data_1['Item'].eq('Pesticides (total)')]
filtered_data_2.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element,Year Code,Year,Value
0,3,Albania,1357,Pesticides (total),Agricultural Use,1990,1990,121.0
1,3,Albania,1357,Pesticides (total),Agricultural Use,1991,1991,121.0
2,3,Albania,1357,Pesticides (total),Agricultural Use,1992,1992,121.0
3,3,Albania,1357,Pesticides (total),Agricultural Use,1993,1993,121.0
4,3,Albania,1357,Pesticides (total),Agricultural Use,1994,1994,201.0


In [None]:
# long -> wide: one row per (area, year) key, one value column per Element
# aggfunc is spelled out for clarity; 'mean' is the pivot_table default
pivoted_data = pd.pivot_table(
    filtered_data_2,
    values='Value',
    index=['Area Code', 'Area', 'Year Code', 'Year'],
    columns='Element',
    aggfunc='mean',
)
pivoted_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Element,Agricultural Use,Use per area of cropland,Use per capita,Use per value of agricultural production
Area Code,Area,Year Code,Year,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Armenia,1992,1992,8.0,0.02,0.0,
1,Armenia,1993,1993,8.0,0.02,0.0,
1,Armenia,1994,1994,23.0,0.05,0.01,
1,Armenia,1995,1995,23.9,0.05,0.01,
1,Armenia,1996,1996,32.0,0.06,0.01,


In [None]:
# turn the area/year index levels back into ordinary columns
# NOTE: this reassigns pivoted_data, so re-running only this cell would add a spurious 'index' column
pivoted_data = pivoted_data.reset_index(drop=False)
pivoted_data.head()

Element,Area Code,Area,Year Code,Year,Agricultural Use,Use per area of cropland,Use per capita,Use per value of agricultural production
0,1,Armenia,1992,1992,8.0,0.02,0.0,
1,1,Armenia,1993,1993,8.0,0.02,0.0,
2,1,Armenia,1994,1994,23.0,0.05,0.01,
3,1,Armenia,1995,1995,23.9,0.05,0.01,
4,1,Armenia,1996,1996,32.0,0.06,0.01,


In [None]:
# remove the leftover 'Element' label from the column axis
pivoted_data = pivoted_data.rename_axis(None, axis='columns')
pivoted_data.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Agricultural Use,Use per area of cropland,Use per capita,Use per value of agricultural production
0,1,Armenia,1992,1992,8.0,0.02,0.0,
1,1,Armenia,1993,1993,8.0,0.02,0.0,
2,1,Armenia,1994,1994,23.0,0.05,0.01,
3,1,Armenia,1995,1995,23.9,0.05,0.01,
4,1,Armenia,1996,1996,32.0,0.06,0.01,


In [14]:
# number of distinct areas in the pivoted table
pivoted_data['Area'].nunique(dropna=False)

252

In [None]:
# count of missing entries in every column
pivoted_data.isnull().sum()

Area Code                                     0
Area                                          0
Year Code                                     0
Year                                          0
Agricultural Use                              0
Use per area of cropland                    408
Use per capita                               99
Use per value of agricultural production    779
dtype: int64

In [16]:
# (number of rows, number of columns) after pivoting to wide format
pivoted_data.shape

(7949, 8)

In [None]:
# Give every column a snake_case name; the four Element columns get a
# 'total_pesticide_use_' prefix.
column_renames = {
    'Area Code': 'area_code',
    'Area': 'area',
    'Year Code': 'year_code',
    'Year': 'year',
    'Agricultural Use': 'total_pesticide_use_for_agriculture',
    'Use per area of cropland': 'total_pesticide_use_per_area_of_cropland',
    'Use per capita': 'total_pesticide_use_per_capita',
    'Use per value of agricultural production': 'total_pesticide_use_per_value_of_agri_production',
}
cleaned_data = pivoted_data.rename(columns=column_renames)
cleaned_data.head()

Unnamed: 0,area_code,area,year_code,year,total_pesticide_use_for_agriculture,total_pesticide_use_per_area_of_cropland,total_pesticide_use_per_capita,total_pesticide_use_per_value_of_agri_production
0,1,Armenia,1992,1992,8.0,0.02,0.0,
1,1,Armenia,1993,1993,8.0,0.02,0.0,
2,1,Armenia,1994,1994,23.0,0.05,0.01,
3,1,Armenia,1995,1995,23.9,0.05,0.01,
4,1,Armenia,1996,1996,32.0,0.06,0.01,


In [None]:
# exporting cleaned dataset as csv file
output_path = Path('cleaned_datasets/pesticides_cleaned.csv')
# to_csv does not create missing folders, so make sure the target directory exists
# (otherwise a fresh checkout fails here on Restart & Run All)
output_path.parent.mkdir(parents=True, exist_ok=True)
cleaned_data.to_csv(output_path, index=False)