## FDW Crop Production Data Profiling - Ethiopia

In [1]:
import os, sys, glob, json
from itertools import product, compress, chain
from functools import reduce
import warnings
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
from tools import save_hdf, PrintAdminUnits, PlotAdminShapes
from tools import FDW_PD_Sweeper, FDW_PD_AvalTable, FDW_PD_Compiling, FDW_PD_GrainTypeAgg, FDW_PD_ValidateFnidName
from tools import FDW_PD_CreateAdminLink, FDW_PD_RatioAdminLink, FDW_PD_ConnectAdminLink
from tools_graphic import PlotBarProduction, PlotLinePAY, PlotHeatCropSystem, PlotHeatSeasonData
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None

In [3]:
# CPCV2 grain code ------------------------------ #
# Read the CPCv2 grain code table and keep a plain dict that maps each
# `product` value to its `product_category`.
grain_code = pd.read_hdf('./data/crop/grain_cpcv2_code.hdf')
product_category = grain_code.set_index('product')['product_category'].to_dict()
# ----------------------------------------------- #

# Load FEWS NET administrative boundaries ------- #
# Every shapefile matching ET_Admin?_????.shp is read, reprojected to `epsg`,
# given an "area" column (geometry area / 10**6; km^2 if the projected CRS is
# in metres - TODO confirm), and bound to a notebook-level variable named after
# the file stem (e.g. ET_Admin2_2019) so that later cells can refer to it.
epsg = 'epsg:32637'
fn_shapes = sorted(glob.glob('./data/shapefile/fewsnet/ET_Admin?_????.shp'))
shape_all = []
for fn in fn_shapes:
    # File stem, e.g. "ET_Admin2_2019"; independent of the directory length.
    name = os.path.splitext(os.path.basename(fn))[0]
    shape = gpd.read_file(fn).to_crs(epsg)
    shape['area'] = shape['geometry'].area / 10**6
    # Bind the layer by name without building source code from the file path,
    # which would break on paths containing quotes or backslashes.
    globals()[name] = shape
    shape_all.append(shape)
# All layers stacked into one frame with a fresh 0..n-1 index.
shape_all = pd.concat(shape_all, axis=0).reset_index(drop=True)
PrintAdminUnits(shape_all)
# ----------------------------------------------- #

# FDW API host address -------------------------- #
# Download the crop production indicator values for Ethiopia from the FDW API
# and keep an untouched copy of the response in `df_origin`.
host = 'https://fdw.fews.net'
# token.json holds a JSON array that is turned into the `auth` tuple
# (presumably [username, password] - verify against the API account setup).
# The context manager closes the file handle once the credentials are parsed.
with open('token.json', "r") as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'Ethiopia',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# Fail loudly on any HTTP error status instead of parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())
df_origin = df.copy()
# ----------------------------------------------- #

# Manual Pre-processing before Sweeping --------- #
# 1. Default setting
# a) A missing or empty population group is labelled 'none'
no_group = df['population_group'].isna() | df['population_group'].eq('')
df.loc[no_group, 'population_group'] = 'none'
# 2. Manual setting
# a) Drop Admin-1 data having population groups: ET1994A103, ET1994A107, ET1994A117
df = df[df['population_group'].eq('none')]
# b) Fix "Maize Grain (Fresh)" which is 10 times larger than reported.
# *** There are no "Maize (Corn)" records in "Meher 2017", and "Maize Grain (Fresh)"
#     appears only in the two most recent years: "Meher 2017" and "Meher 2019".
# *** It is confirmed that all "Maize (Corn)" records in 2019 overlap with
#     "Maize Grain (Fresh)".
# *** So "Maize (Corn)" is removed and "Maize Grain (Fresh)" is fixed.
meher_2017_2019 = ['Meher 2017', 'Meher 2018', 'Meher 2019']
# Remove "Maize (Corn)" (cpcv2 R01122AA) collected during 2017-2019.
maize_corn_collected = (
    df['status'].eq('Collected')
    & df['cpcv2'].eq('R01122AA')
    & df['season_year'].isin(meher_2017_2019)
)
df = df[~maize_corn_collected]
# Fix 10 times larger values (this applies to other grain types as well).
# Conditions are used rather than record IDs because IDs can be mixed.
# A collected yield above 10 flags its (fnid, season_year, product) series.
inflated_yield = (
    df['status'].eq('Collected')
    & df['season_year'].isin(meher_2017_2019)
    & df['indicator'].eq('Yield')
    & df['value'].gt(10)
)
records = df.loc[inflated_yield, ['fnid', 'season_year', 'product']].drop_duplicates()
# Only these two indicators are rescaled; the indicator column does not change
# inside the loop, so the mask is computed once.
rescaled_indicator = df['indicator'].isin(['Yield', 'Quantity Produced'])
for fnid, season_year, product_name in records.itertuples(index=False, name=None):
    idx = (
        df['fnid'].eq(fnid)
        & df['season_year'].eq(season_year)
        & df['product'].eq(product_name)
    )
    df.loc[idx & rescaled_indicator, 'value'] /= 10
# ----------------------------------------------- #

# FDW Production Data Inspection ---------------- #
# FDW_PD_Sweeper returns two frames: `df` replaces the pre-processed data and
# the second is kept as `df_raw` (presumably the cleaned and pre-cleaning
# records respectively - verify against tools.FDW_PD_Sweeper).
df, df_raw = FDW_PD_Sweeper(df)
# Built from the swept data and all loaded admin shapes; the structure of
# `table_dict` is defined by tools.FDW_PD_AvalTable.
table_dict = FDW_PD_AvalTable(df, shape_all)
# ----------------------------------------------- #

# FEWS NET Shapefile comparison ----------------- #
# Admin-2 boundary layers for 1994 through 2019, listed oldest first. They are
# stacked into one frame and plotted with label=False.
admin2_layers = [
    ET_Admin2_1994,
    ET_Admin2_2001,
    ET_Admin2_2003,
    ET_Admin2_2007,
    ET_Admin2_2008,
    ET_Admin2_2014,
    ET_Admin2_2019,
]
shape_used = pd.concat(admin2_layers, axis=0)
PlotAdminShapes(shape_used, label=False)
# ----------------------------------------------- #

- FEWS NET admin shapefiles ------------------- #
        Admin1  # units    Admin2  # units
year                                      
1994  ET1994A1       11  ET1994A2       66
2001  ET2001A1       11  ET2001A2       68
2003  ET2003A1       11  ET2003A2       73
2007  ET2007A1       11  ET2007A2       75
2008  ET2008A1       11  ET2008A2       80
2014  ET2014A1       11  ET2014A2       77
2019  ET2019A1       11  ET2019A2       84
2020  ET2020A1       11  ET2020A2       91
----------------------------------------------- #
- Remove missing records ---------------------- #
Orignial data points: 22,385
Removed 37 "Collected" points
Removed 5,185 "Missing Historic Data" points
Removed 1,059 "Not Collected" points
0/5,482 "Area Harvested" points are retained.
5,584/5,632 "Area Planted" points are retained.
5,281/5,637 "Quantity Produced" points are retained.
5,239/5,634 "Yield" points are retained.
Current data points: 16,104

- Minor changes are applied.. ----------------- #

- Basic inf

- Ethiopia crop seasonal calendar

![FEWS NET](https://fews.net/sites/default/files/styles/large/public/seasonal-calendar-ethiopia.png?itok=lMgNmg30)

- FDW data consists of `ET1994A1`, `ET1994A2`, `ET2001A1`, `ET2001A2`, `ET2003A1`, `ET2003A2`, `ET2007A2`, `ET2008A1`, `ET2008A2`, `ET2014A1`, `ET2014A2`, and `ET2019A2`.

| Year | Admin-1 | # units  | Admin-2  | # units |
| :---: | :----:  | :----:   | :----:   | :---:  |
| 1994 | **`ET1994A1`**| 11        | **`ET1994A2`** | 66      |
| 2001 | **`ET2001A1`**| 11        | **`ET2001A2`** | 68      |
| 2003 | **`ET2003A1`**| 11       | **`ET2003A2`** | 73       |
| 2007 | ET2007A1| 11       | **`ET2007A2`** | 75     |
| 2008 | **`ET2008A1`**| 11        | **`ET2008A2`** | 80      |
| 2014 | **`ET2014A1`**| 11        | **`ET2014A2`** | 77      |
| 2019 | ET2019A1| 11        | **`ET2019A2`** | 84      |
| 2020 | ET2020A1| 11        | ET2020A2      | 91|

- Comparison between admin boundaries<br>
![image](./data/shapefile/fewsnet/figures/ET_admin_shapes.png "Ethiopia")

- Ethiopia has two crop seasons: `Meher (10-01)`, `Belg (06-01)`.
- Ethiopia has one crop production system: `None`.
- `The number of admin-2 districts decreased in 2014 (ET2014A2).`
- **`ET2019A2`** is used to represent all admin-level 2 crop data.

### Manual modification of original data
- Drop Admin-1 data having population groups: ET1994A103, ET1994A107, ET1994A117.
- Fix Admin-2 data having 10 times larger quantity produced and yield.
- [`Basketo`](https://en.wikipedia.org/wiki/Basketo_special_woreda), which is a special woreda, was recorded during 2001-2018 and removed in the 2019 shapefile (`ET_Admin2_2019.shp`). However, its production data is still reported in the 2019 FDW data (FNID: `ET2019A20718`).
- 23 districts of ET2019A2 have slightly different names in the FDW data compared to ET2014A2 and `ET_Admin2_2019.shp`. We replaced those names with the names used in `ET_Admin2_2019.shp`.

In [4]:
# Define the latest shapefile ------------------- #
# The 2019 admin-2 layer, reprojected to epsg:4326, is the reference layer.
latest_level = 2
shape_latest = ET_Admin2_2019.copy()
shape_latest = shape_latest.to_crs('epsg:4326')
# ----------------------------------------------- #

# Validation of FNIDs and Names ----------------- #
# The validator receives the data, the admin-2 layers in use and the reference
# layer, and returns the frame that replaces `df`.
df = FDW_PD_ValidateFnidName(df, shape_used, shape_latest)
# ----------------------------------------------- #

# FDW Production Data Compiling ----------------- #
area, prod = FDW_PD_Compiling(df, shape_used)
# Independent copies of the compiled tables are kept under the *_all names.
area_all = area.copy()
prod_all = prod.copy()
# Unique column keys that remain after dropping the first two column levels.
mdx_pss = area.columns.droplevel([0, 1]).unique()
# ----------------------------------------------- #

ET2019A20717:	"Dawuro" (FDW) is changed to "Dawro" (shapefile).
ET2019A21204:	"Etang Special" (FDW) is changed to "Itang" (shapefile).
ET2019A20420:	"Finfine Special" (FDW) is changed to "Finfinne Special" (shapefile).
ET2019A20701:	"Guraghe" (FDW) is changed to "Gurage" (shapefile).
ET2019A20721:	"Halaba Special" (FDW) is changed to "Alaba" (shapefile).
ET2019A20419:	"Horo Gudru Wellega" (FDW) is changed to "Horo Guduru" (shapefile).
ET2019A20403:	"Ilu Aba Bora" (FDW) is changed to "Ilubabor" (shapefile).
ET2019A20709:	"Kefa" (FDW) is changed to "Keffa" (shapefile).
ET2019A20703:	"Kembata Tibaro" (FDW) is changed to "Kembata Tembaro" (shapefile).
ET2019A20719:	"Konta Special" (FDW) is changed to "Konta" (shapefile).
ET2019A20605:	"Mao Komo Special" (FDW) is changed to "Mao-Komo" (shapefile).
ET2019A20303:	"North Wello" (FDW) is changed to "North Wollo" (shapefile).
ET2019A20713:	"Segen Area P." (FDW) is changed to "Segen" (shapefile).
ET2019A20506:	"Shabelle" (FDW) is changed to "Sheb

In [3]:
# Define the latest shapefile ------------------- #
# NOTE(review): `ET_adm2_2019` is not defined anywhere in the visible source;
# the shapefile loader binds names taken from files matching
# ET_Admin?_????.shp (e.g. ET_Admin2_2019). This cell looks like an earlier
# revision of the notebook - confirm before running it.
shape_latest = ET_adm2_2019.copy().to_crs('epsg:4326')
# ----------------------------------------------- #

# Validation of FNIDs and Names ----------------- #
# Compare FNIDs and names between the FDW data and the FEWS NET shapefiles.
# Unique (fnid, admin_2) pairs of the records whose FNID has "A2" at
# characters 6-7, i.e. the admin-2 records.
name_data = df.loc[df['fnid'].apply(lambda x: x.startswith('A2',6)), ['fnid','admin_2']].drop_duplicates()
name_shape = shape_all[['FNID','ADMIN2']]
# - Check that all FNIDs exist in the shapefiles.
fnid_not_in_shape = name_data[~name_data['fnid'].isin(name_shape['FNID'])]
# The check is disabled because of a known exception: 'Basketo (ET2019A20718)'.
# assert len(fnid_not_in_shape) == 0  # 'Basketo (ET2019A20718)'
# Keep only the FNIDs that can be looked up in the shapefiles.
name_data = name_data[name_data['fnid'].isin(name_shape['FNID'])]
# - Check that all names match between FDW and the shapefiles.
# Shapefile names are re-ordered to follow name_data['fnid'], so the two tables
# can be compared row by row.
# NOTE(review): this relies on FNID being unique in shape_all and in name_data;
# otherwise the positional comparison below is misaligned - confirm.
name_shape_matched = name_shape.set_index('FNID').loc[name_data['fnid']]
# `.values` makes the comparison positional rather than index-aligned.
differ = name_shape_matched['ADMIN2'] != name_data['admin_2'].values
# assert differ.sum() == 0 
# Replace names
# Pair each differing shapefile name (indexed by FNID) with the FDW name that
# carries the same fnid.
name_replace = pd.merge(name_shape_matched[differ], name_data[differ.values], 
                        left_index=True, right_on='fnid')
# Mapping from the FDW name to the shapefile name.
# NOTE(review): the mapping is keyed by name only, so an FDW name that maps to
# different shapefile names for different FNIDs keeps a single entry - verify.
name_replace_dict = name_replace[['admin_2','ADMIN2']].set_index('admin_2').to_dict()['ADMIN2']
# Only rows whose fnid is in the replacement list are rewritten.
name_replaced = df.loc[df['fnid'].isin(name_replace['fnid']),'admin_2'].replace(name_replace_dict)
df.loc[df['fnid'].isin(name_replace['fnid']),'admin_2'] = name_replaced.values
# Define administrative names
# The 8th FNID character selects the source of `name`: '1' copies ADMIN1 and
# '2' copies ADMIN2.
for gdf in [shape_all, shape_latest]:
    fdx = gdf.FNID.apply(lambda x: x[7] == '1'); gdf.loc[fdx,'name'] = gdf.loc[fdx, 'ADMIN1']
    fdx = gdf.FNID.apply(lambda x: x[7] == '2'); gdf.loc[fdx,'name'] = gdf.loc[fdx, 'ADMIN2']
# ----------------------------------------------- #

# Transition links ------------------------------ #
# One link table per pair of consecutive admin-2 boundary versions, matched on
# the ADMIN2 column of both layers; only the first element of each returned
# value is kept.
# NOTE(review): `CreateLinkAdmin`, `CreateNestedLinks` and the `ET_adm2_*`
# layers are not imported or defined in the visible source (the import cell
# brings in FDW_PD_CreateAdminLink / FDW_PD_ConnectAdminLink instead). This
# looks like code written against an earlier version of `tools` - confirm.
adm_link_2001 = CreateLinkAdmin(ET_adm2_1994, ET_adm2_2001, old_on='ADMIN2', new_on='ADMIN2')[0]
adm_link_2003 = CreateLinkAdmin(ET_adm2_2001, ET_adm2_2003, old_on='ADMIN2', new_on='ADMIN2')[0]
adm_link_2007 = CreateLinkAdmin(ET_adm2_2003, ET_adm2_2007, old_on='ADMIN2', new_on='ADMIN2')[0]
adm_link_2008 = CreateLinkAdmin(ET_adm2_2007, ET_adm2_2008, old_on='ADMIN2', new_on='ADMIN2')[0]
adm_link_2014 = CreateLinkAdmin(ET_adm2_2008, ET_adm2_2014, old_on='ADMIN2', new_on='ADMIN2')[0]
adm_link_2019 = CreateLinkAdmin(ET_adm2_2014, ET_adm2_2019, old_on='ADMIN2', new_on='ADMIN2')[0]
# The per-step links are merged into one mapping before nesting; on a key
# collision the later step overwrites the earlier one.
adm_link = CreateNestedLinks({
    **adm_link_2001, **adm_link_2003, 
    **adm_link_2007, **adm_link_2008, 
    **adm_link_2014, **adm_link_2019, 
})
# ----------------------------------------------- #

# FDW Production Data Compiling ----------------- #
# NOTE(review): this call passes five arguments and unpacks four results,
# whereas the other FDW_PD_Compiling call in this notebook passes
# (df, shape_used) and unpacks two - only one form can match the installed
# `tools` module; confirm which.
area, prod, area_all, prod_all = FDW_PD_Compiling(df, shape_all, shape_latest, 
                                                               adm_link=adm_link, 
                                                               areal_weight=True)
# ----------------------------------------------- #
# Disabled: relabelling of column level 3 to 'Annual'.
# area.columns.set_levels(['Annual'], level=3, inplace=True)
# prod.columns.set_levels(['Annual'], level=3, inplace=True)

# Aggregate grain data by grain type ------------ #
# All four tables are aggregated with the same product -> product_category
# mapping and come back in the same order.
[area, prod, area_all, prod_all] = FDW_PD_GrainTypeAgg([area, prod, area_all, prod_all], product_category)
# ----------------------------------------------- #

# # Manual correction ----------------------------- #
# crop = prod/area
# # ----------------------------------------------- #

# Complete long format DataFrame
# Each wide table is transposed and stacked into one row per value; the value
# column is named 'value' and the rows are tagged with their indicator.
# Yield is derived as production divided by area.
df_area = area.T.stack().reset_index().rename(columns={0: 'value'}).assign(indicator='area')
df_prod = prod.T.stack().reset_index().rename(columns={0: 'value'}).assign(indicator='production')
df_yield = (prod / area).T.stack().reset_index().rename(columns={0: 'value'}).assign(indicator='yield')
# Insert a country name
stack = pd.concat([df_area, df_prod, df_yield], axis=0).assign(country='Ethiopia')
column_order = [
    'fnid', 'country', 'name', 'product', 'year',
    'season_name', 'season_date', 'indicator', 'value',
]
stack = stack[column_order].reset_index(drop=True)
# Change season_date to harvest_end
stack = stack.rename(columns={'season_date': 'harvest_end'})
harvest_end_by_season_date = {
    '10-01': '10-01',  # Meher
    '06-01': '06-01',  # Belg
}
stack['harvest_end'] = stack['harvest_end'].replace(harvest_end_by_season_date)

# Save data
# The validated FDW records and the long-format table are written to separate
# HDF files, in this order.
for out_path, out_frame in (
    ('./data/crop/adm_crop_production_raw_ET.hdf', df),
    ('./data/crop/adm_crop_production_ET.hdf', stack),
):
    save_hdf(out_path, out_frame)
# # File to Seth Peterson
# stack_maize = stack[stack['product'] == 'Maize'].reset_index(drop=True)
# stack_maize.to_csv('./data/crop/adm_crop_production_ET_Maize.csv')
# print('./data/crop/adm_crop_production_ET_Maize.csv is saved.')

- Aggregation of grain types ------------------ #
10 crops: Barley (Unspecified), Barley (White), Maize (Corn), Maize Grain (Fresh), Millet (Finger), Mixed Teff, Oats (Unspecified), Rice (Paddy), Sorghum, Wheat Grain
8 crops: Barley, Maize, Millet, Oats, Rice, Sorghum, Teff, Wheat

./data/crop/adm_crop_production_raw_ET.hdf is saved.
./data/crop/adm_crop_production_ET.hdf is saved.
