In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import folium

## Load data into a pandas DataFrame

In [None]:
# Load the raw FAO crops export in one go. Besides the crop records, the file
# carries a short metadata footer in its last rows, which is split off from the
# data right after loading.
# NOTE: zero values are NOT converted to NaN here (no `na_values` is passed);
# rows with a zero 'value' are removed explicitly during preprocessing instead.
complete_dataset = pd.read_csv('data/fao_data_crops_data.csv')



In [None]:
# Separate the actual records from the trailing metadata: everything up to row
# label 2255342 is crop data ('crops'), while rows 2255344-2255348 form a small
# legend of acronyms describing how a given sample was acquired ('flags').
# The legend is purely informative.
crops = complete_dataset.loc[:2255342].copy()

flags = (
    complete_dataset.loc[2255344:2255348]
    .drop(columns=['element', 'year', 'unit', 'value', 'value_footnotes', 'category'])
    .rename(columns={'country_or_area': 'acronym', 'element_code': 'description'})
    .set_index('acronym')
)
flags

## Exploratory data analysis

In [None]:
# Peek at the first five crop records.
crops.head(n=5)

In [None]:
# Report the overall dimensions and the dtype inferred for each column.
print(f"Size of the DataFrame: {crops.shape}\n")
print(f"Variable types present in DataFrame: \n{crops.dtypes}")

In [None]:
# Collect every distinct footnote marker that occurs in the dataset.
footnotes = crops['value_footnotes'].unique()
print(footnotes)

# Show the records carrying one specific marker (the fifth distinct value found) ...
display(crops.loc[crops['value_footnotes'] == footnotes[4]])

# ... and, as the cell output, the records that have no footnote at all (NaN).
crops.loc[crops['value_footnotes'].isna()]

In [None]:
# Print the distinct values of each descriptive column, one column after another.
for column_name in ('element', 'year', 'unit', 'category', 'element_code', 'country_or_area'):
    print(crops[column_name].unique())

## Data preprocessing

In [None]:
# Per-column NaN indicator: prints one boolean per column, in column order
# (True when the column holds at least one missing value).
print(crops.isna().to_numpy().any(axis=0))

# Discard the rows in which every single field is missing.
crops.dropna(axis=0, how='all', inplace=True)



In [None]:
# Keep only the records that carry a usable measurement. A row is discarded when
#   * its 'value' is missing (NaN),
#   * its 'value_footnotes' contains 'NR', or
#   * its 'value' is exactly 0.
# Rows whose 'value_footnotes' is NaN are kept: previously they were dropped
# together with the NaN values, which silently removed every record that simply
# has no footnote. `na=False` makes the 'NR' test treat a missing footnote as
# "no match" instead of propagating NaN into the boolean mask.
value_missing = crops['value'].isna()
flagged_nr = crops['value_footnotes'].str.contains('NR', na=False, regex=False)
value_zero = crops['value'].eq(0)

crops.drop(index=crops.index[value_missing | flagged_nr | value_zero], inplace=True)


In [None]:
# Total 'value' per (country/area, year), largest totals first.
# Only 'value' is aggregated, and it is selected explicitly: summing every
# column would also try to "sum" the descriptive text columns, which recent
# pandas versions no longer drop silently.
# NOTE(review): this adds up 'value' across all elements/units recorded for a
# country-year -- confirm that mixing them is intended.
crops_by_region_year = (
    crops.groupby(['country_or_area', 'year'])[['value']]
    .sum()
    .sort_values(by='value', ascending=False)
)


In [None]:
# Attach each country's/area's average of the yearly totals as a 'mean' column,
# repeated on every (country, year) row of that country.
# `DataFrame.mean(level=...)` was removed in pandas 2.0, so the per-country mean
# is computed with a level-based groupby; `transform` broadcasts the result back
# onto the original rows, keeping their order.
crops_by_region_year_mean = crops_by_region_year.assign(
    mean=crops_by_region_year.groupby(level='country_or_area')['value'].transform('mean')
)


In [None]:
# Rank the (country, year) rows by their country's average, highest first.
crops_by_region_year_mean.sort_values('mean', ascending=False)

In [None]:

# Distinct country/area labels found in the index.
crops_by_region_year_mean.index.unique(level='country_or_area')

In [None]:
# Histogram of the per-country average, one observation per country/area.
# Two problems of the previous call are fixed here:
#   * `.mean` resolved to the DataFrame *method*, not the 'mean' column -- a
#     column named like a method has to be accessed with brackets;
#   * `bins` received the country names, but histogram bins must be numeric
#     (a bin count, bin edges, or a binning strategy such as 'auto').
# The 'mean' column repeats the same number on every year row of a country, so
# it is collapsed to a single value per country before plotting.
country_means = crops_by_region_year_mean['mean'].groupby(level='country_or_area').first()

fig, ax = plt.subplots()
ax.hist(country_means, bins='auto')
ax.set(
    title='Distribution of the mean value per country/area',
    xlabel='Mean of the yearly summed value',
    ylabel='Number of countries/areas',
);
