# ADA Project - Insight into world agriculture production and its links to global hunger

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import folium

## Load data into a Pandas dataframe

In [None]:
# Read the raw crops CSV (relative path) into a single dataframe; it is split
# into measurements and flag descriptions in the next cell.
complete_dataset = pd.read_csv('../data/fao_data_crops_data.csv')

In [None]:
# The loaded table mixes the measurements with a short legend of flag acronyms.
# They are separated by (hard-coded) row label into 'crops' and 'flags', respectively.
crops = complete_dataset.loc[:2255342].copy()

# 'flags' is a correspondence list of the acronyms describing how a given sample
# was acquired (informative only). In those rows the acronym is stored under
# 'country_or_area' and its description under 'element_code'; every other
# column is discarded.
unused_columns = ['element', 'year', 'unit', 'value', 'value_footnotes', 'category']
flags = (
    complete_dataset.loc[2255344:2255348]
    .drop(columns=unused_columns)
    .rename(columns={'country_or_area': 'acronym', 'element_code': 'description'})
    .set_index('acronym')
)
flags

## Exploratory data analysis

In [None]:
# Preview the first rows of the measurement table.
crops.head()

In [None]:
# Report the frame's dimensions, then the dtype of every column.
print(f"Size of the DataFrame: {crops.shape}\n")
print(f"Variable types present in DataFrame: \n{crops.dtypes}")

In [None]:
# Distinct footnote codes found in the dataset, in order of first appearance
footnotes = crops['value_footnotes'].unique()
print(footnotes)
# Show the rows carrying one particular footnote code (the fifth distinct value)
selected_footnote = footnotes[4]
display(crops[crops['value_footnotes'] == selected_footnote])
# Rows for which no footnote was recorded (NaN in 'value_footnotes')
crops[crops['value_footnotes'].isna()]

In [None]:
# Print the distinct values of each descriptive column, one array per column.
for column in ('element', 'year', 'unit', 'category', 'element_code', 'country_or_area'):
    print(crops[column].unique())

## Data preprocessing

In [None]:
# One boolean per column (in column order): True when the column holds at least one NaN.
has_nan_by_column = crops.isna().to_numpy().any(axis=0)
print(has_nan_by_column)

# Discard the rows in which every single field is missing.
crops.dropna(axis=0, how='all', inplace=True)



In [None]:
# Keep only the samples that carry usable information:
#   * 'value' must be known (samples with a NaN value are of no utility);
#   * the footnote must not contain 'NR';
#   * 'value' must be non-zero.
# Rows WITHOUT a footnote are kept. They used to be removed as a side effect of
# listing 'value_footnotes' in dropna(), although only unknown values were meant
# to be dropped.
# NOTE(review): confirm that a missing footnote does not mark a sample as unusable.
has_value = crops['value'].notna()
# na=False makes a missing footnote count as "does not contain 'NR'" instead of NaN,
# so the mask stays strictly boolean.
flagged_nr = crops['value_footnotes'].str.contains('NR', na=False)
is_zero = crops['value'] == 0

crops = crops[has_value & ~flagged_nr & ~is_zero].copy()


Separate regions from countries

In [None]:
# Region rows are recognised by a '+' in their name; every other row is treated
# as an individual country.
# regex=False matches the literal '+' character (the previous pattern '\+' relied on
# an invalid escape sequence in a normal string literal); na=False sends rows
# without a name to the country side instead of producing a NaN mask.
regions_bool = crops['country_or_area'].str.contains('+', regex=False, na=False)
crops_regions = crops[regions_bool].copy()
crops_countries = crops[~regions_bool].copy()
# Spot check: last rows whose name contains 'China' on the country side.
crops_countries[crops_countries.country_or_area.str.contains('China')].tail(10)

In [None]:
# Mean 'value' of each (country, element) pair, stored in a single column named
# 'mean_'. Note that, despite the variable name, 'year' is not a grouping key:
# the mean runs over every sample of the pair.
crops_countries_by_country_year = (
    crops_countries
    .groupby(['country_or_area', 'element'])['value']
    .mean()
    .to_frame(name='mean_')
)
crops_countries_by_country_year

In [None]:
# Select the 'Area Harvested' rows of every country; both index levels are kept.
idx = pd.IndexSlice
area_harvested = crops_countries_by_country_year.loc[idx[:, 'Area Harvested'], :]
# Look at a single country as an example.
area_harvested.loc['United States of America']

#### Create a map showing yield by country (average over all years) 

In [None]:
# Rows of the 'Yield' element for every country (both index levels are kept).
yield_rows = (slice(None), 'Yield')
yield_df = crops_countries_by_country_year.loc[yield_rows, :]
# Natural logarithm of the mean yield, kept as a one-column dataframe ('mean_').
log_yield_df = np.log(yield_df[['mean_']])
log_yield_df.head()


In [None]:
# Choropleth of the logarithm of each country's mean yield.
world_geo = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json'

# Turn the (country, element) index into regular columns so that the country
# name can be referenced by column name in `columns=`.
log_yield_by_country = log_yield_df.reset_index()

# Quartile edges of the values that are actually plotted (the log values).
# Currently unused; pass `bins=Bins` to the Choropleth to use them.
Bins = list(log_yield_by_country['mean_'].quantile([0, 0.25, 0.5, 0.75, 1]))

# A single map is created. The former extra map centred on [48, -102] was
# overwritten before being used, so the effective settings are unchanged.
m = folium.Map(zoom_start=3)

folium.Choropleth(
    geo_data=world_geo,
    name='choropleth',
    data=log_yield_by_country,
    columns=['country_or_area', 'mean_'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    # The plotted values are logarithms, so the legend says so.
    legend_name='log(mean yield)',
    reset=True
).add_to(m)

folium.LayerControl().add_to(m)

m

#### Area harvested (mean over years) / country surface (2018), by country --> kind of an average by year...


Not really sure about how we could interpret this...

In [None]:
# Per-country reference table, indexed by country name.
# NOTE(review): presumably country land area in km² (going by the file name) — confirm.
surface_country = (
    pd.read_csv('../data/API_AG.LND.TOTL.K2_DS2_en_csv_v2_422954.csv', skiprows=3)
    .set_index('Country Name')
)
# Attach each country's 2018 figure to its mean harvested area (left join on the country name).
crops_countries_area = area_harvested.join(surface_country['2018'], on='country_or_area', how='left')
# Mean harvested area divided by 100 times the 2018 figure.
# NOTE(review): the factor 100 presumably converts km² to hectares — confirm the units.
crops_countries_area['ratio'] = area_harvested['mean_'] / (crops_countries_area['2018'] * 100)
# Rows with any missing field (e.g. no matching 2018 figure) are discarded.
crops_countries_area = crops_countries_area.dropna()
pl = crops_countries_area.plot(
    kind="bar",
    y="ratio",
    figsize=(30, 7),
    alpha=0.5,
    color="olive",
)

In [None]:
# Preview the joined table, including the computed 'ratio' column.
crops_countries_area.head()

#### Create a map showing this ratio by country

In [None]:
# Keep only the 'ratio' column as a one-column dataframe.
crops_countries_area_df = crops_countries_area[['ratio']].copy()
# Natural logarithm of the ratio, computed on the whole column at once.
log_df = np.log(crops_countries_area_df)
# Show the result itself: the earlier mid-cell .head() was never displayed and
# the final type() call was only a debugging aid.
log_df.head()

In [None]:
# Choropleth of the logarithm of the harvested-area / country-surface ratio.
world_geo = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json'

# Turn the (country, element) index into regular columns so that the country
# name can be referenced by column name in `columns=`.
log_ratio_by_country = log_df.reset_index()

# Quartile edges of the values that are actually plotted (the log values).
# Currently unused; pass `bins=Bins` to the Choropleth to use them.
Bins = list(log_ratio_by_country['ratio'].quantile([0, 0.25, 0.5, 0.75, 1]))

# A single map is created. The former extra map centred on [48, -102] was
# overwritten before being used, so the effective settings are unchanged.
m = folium.Map(zoom_start=3)

folium.Choropleth(
    geo_data=world_geo,
    name='choropleth',
    data=log_ratio_by_country,
    columns=['country_or_area', 'ratio'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    # The plotted values are logarithms of harvested area over country surface.
    legend_name='log(surface harvested / country surface)',
    reset=True
).add_to(m)

folium.LayerControl().add_to(m)

m

In [None]:
# Total 'value' of every (country, element, category) combination, flattened to
# regular columns and ordered from the largest total to the smallest.
crops_countries_by_country_by_category = (
    crops_countries
    .groupby(['country_or_area', 'element', 'category'])['value']
    .sum()
    .reset_index(name='total')
    .sort_values(by='total', ascending=False)
)

crops_countries_by_country_by_category.head(10)

  
 ## What are the principal foodstuffs produced in each country/region of the world? And which countries are the biggest producers for a given food?

In [None]:
# For every country keep only its first row, i.e. the (element, category) pair with
# the largest 'total', given the descending sort applied when the table was built.
# NOTE(review): totals of different elements compete for the top row here — confirm
# that comparing them with each other is intended.
main_product=crops_countries_by_country_by_category.drop_duplicates(subset='country_or_area', keep='first')
main_product.head(10)

<div class="alert alert-block alert-info">
    
  We decide to keep only the Production Quantity.
</div>

In [None]:
# Most produced category of every country, measured by production quantity only.
# The 'Production Quantity' filter is applied BEFORE picking each country's top
# row. Filtering afterwards silently removed every country whose overall top row
# belonged to another element.
production_totals = crops_countries_by_country_by_category[
    crops_countries_by_country_by_category['element'].str.contains('Production Quantity')
]
main_product_quantity = (
    production_totals
    .sort_values(by='total', ascending=False)   # explicit: largest total first
    .drop_duplicates(subset='country_or_area', keep='first')
)
# Row labels of the retained rows (name kept from the previous version of this cell).
ind_keep = main_product_quantity.index
main_product_quantity.head(10)

In [None]:
# Distinct categories present in 'main_product_quantity'.
main_product_quantity.category.unique()

<div class="alert alert-block alert-info">
    In the previous dataframe (main_product_quantity) we show the category that is most produced by each country in terms of production quantity.
</div>
    

 <div class="alert alert-block alert-info">
    
   In this dataframe, we can see that China and the United States are the main producers of cereals, while Canada is the main producer of cereals_rice_milled. Nigeria and Poland are the main producers of roots and tubers. The Philippines mainly produces sugar cane and Malaysia mainly produces oil_palm_fruit.
 </div>
 


## Are all countries equal in terms of diversity of foodstuffs harvested?
    

In [None]:
# Number of samples recorded for every (country, category) pair, as a flat table
# with the count stored in a 'total' column.
food_diversity = (
    crops_countries
    .groupby(['country_or_area', 'category'])
    .size()
    .reset_index(name='total')
)
food_diversity.head(10)

In [None]:
# Number of distinct categories recorded for each country.
# Computed directly from 'crops_countries' instead of re-aggregating
# 'food_diversity' onto itself, so re-running this cell always gives the same
# table (the self-referential version returned 1 for every country on a second run).
# The column name is kept exactly as before in case other cells refer to it.
food_diversity = (
    crops_countries
    .dropna(subset=['category'])          # countries without any category are left out, as before
    .groupby('country_or_area')['category']
    .nunique()
    .reset_index(name='categoty_diversity')
)

In [None]:
# Preview the first rows of 'food_diversity'.
food_diversity.head(10)

In [None]:
# Number of samples recorded for each category.
# Wrapping the GroupBy object itself in pd.DataFrame aggregates nothing; this
# restores the aggregation that had been left commented out at the end of the line.
# NOTE(review): this step still looks unfinished — confirm which per-category
# statistic is actually wanted here.
food_producer = (
    crops_countries
    .groupby('category')
    .size()
    .reset_index(name='total')
)
food_producer.head(10)

TODO: this step does not look finished yet.

## Trying my best to enable interactive visualization throughout the years


#### Interactive visualization plot #1


In [None]:
#TO RUN THIS: with conda --> conda install -c conda-forge ipywidgets
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
#To enable interactive viz on lab --> conda install nodejs
#                                  + jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
#Interactive visualization
def viz_year(year, element, category):
    """Bar-plot the value of every region for one year, element and category.

    Parameters
    ----------
    year, element, category
        Values matched against the 'year', 'element' and 'category' columns
        of `crops_regions`.

    When no row matches the selection, a short message is printed instead of
    trying to plot an empty frame.
    """
    # One combined mask instead of three chained .loc calls with full-length masks.
    selection = (
        (crops_regions['year'] == year)
        & (crops_regions['element'] == element)
        & (crops_regions['category'] == category)
    )
    df_to_plot = crops_regions.loc[selection]
    if df_to_plot.empty:
        print(f'No data to plot for {element} / {category} in {year}.')
        return
    ax = df_to_plot.plot(kind='bar', x='country_or_area', y='value', figsize=(20, 10))
    ax.set_title(f'{element} values of {category} by regions for year {year}', fontsize=20)
    ax.set_xlabel("Regions", fontsize=20)
    ax.set_ylabel("Values", fontsize=20)
    plt.show()

# Options offered by the widgets: every distinct value found in the regions table.
years = crops_regions.year.unique()
elements = crops_regions.element.unique()
categories = crops_regions.category.unique()
# Trailing ';' hides the function repr that interact() returns.
interact(viz_year, year=years, element=elements, category=categories);

We can see that for some categories there are no numerical values to plot --> Remove them from dataframe? --> only for some categories (eg cereals_total)


#### Interactive visualization plot #2

Show the evolution throughout the years of a given element (production, yield, ... chosen interactively) for a given category (chosen interactively) in a given country (chosen interactively).

In [None]:
#Interactive visualization
def viz_evolution(country, element, category):
    """Line-plot the values of one element and category in one country over the years.

    Parameters
    ----------
    country, element, category
        Values matched against the 'country_or_area', 'element' and 'category'
        columns of `crops_countries`.

    When no row matches the selection, a short message is printed instead of
    trying to plot an empty frame.
    """
    # One combined mask instead of three chained .loc calls with full-length masks.
    selection = (
        (crops_countries['country_or_area'] == country)
        & (crops_countries['element'] == element)
        & (crops_countries['category'] == category)
    )
    # Sorted by year so the line is drawn in chronological order.
    df_to_plot = crops_countries.loc[selection].sort_values(by='year')
    if df_to_plot.empty:
        print(f'No data to plot for {element} / {category} in {country}.')
        return
    ax = df_to_plot.plot(x='year', y='value', figsize=(20, 10))
    ax.set_title(f'{element} values of {category} in {country} throughout years', fontsize=20)
    ax.set_xlabel("Year", fontsize=20)
    ax.set_ylabel("Values", fontsize=20)
    plt.show()

# Options offered by the widgets: every distinct value found in the countries table.
countries = crops_countries.country_or_area.unique()
elements = crops_countries.element.unique()
categories = crops_countries.category.unique()
# Trailing ';' hides the function repr that interact() returns.
interact(viz_evolution, country=countries, element=elements, category=categories);