## FDW Crop Production Data Profiling - South Africa

In [1]:
import os, sys, glob, json
from itertools import product, compress, chain
from functools import reduce
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
from tools import save_hdf, save_npz, load_npz, PrintAdminUnits, PlotAdminShapes
from tools import FDW_PD_Sweeper, FDW_PD_AvalTable, FDW_PD_Compiling, FDW_PD_GrainTypeAgg, FDW_PD_ValidateFnidName
from tools import FDW_PD_CreateAdminLink, FDW_PD_RatioAdminLink, FDW_PD_ConnectAdminLink
from tools import FDW_PD_CaliSeasonYear
from tools_graphic import PlotBarProduction, PlotLinePAY, PlotHeatCropSystem, PlotHeatSeasonData
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None

In [2]:
# CPCV2 grain code ------------------------------ #
# Read the CPCv2 grain code table from the project's HDF file.
grain_code = pd.read_hdf('./data/crop/grain_cpcv2_code.hdf')
# Build a {product: product_category} lookup from that table.
product_category = grain_code.set_index('product')['product_category'].to_dict()
# ----------------------------------------------- #

# Load FEWS NET administrative boundaries ------- #
# Every matching shapefile is reprojected to `epsg`, given an "area" column,
# bound to a notebook-level variable named after the file stem
# (e.g. ZA_Admin1_1994), and finally concatenated into `shape_all`.
epsg = 'EPSG:22234' # South Africa
fn_shapes = sorted(glob.glob('./data/shapefile/fewsnet/ZA_Admin?_????.shp'))
if not fn_shapes:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        'No FEWS NET shapefiles match ./data/shapefile/fewsnet/ZA_Admin?_????.shp'
    )
shape_all = []
for fn in fn_shapes:
    # File stem, e.g. "ZA_Admin1_1994"; robust to path separators and length.
    name = os.path.splitext(os.path.basename(fn))[0]
    admin_shape = gpd.read_file(fn).to_crs(epsg)
    # Geometry area divided by 10**6 (km^2 if the CRS unit is metres - TODO confirm).
    admin_shape['area'] = admin_shape['geometry'].area/10**6
    # Bind the layer under its file-stem name without exec() on string-built
    # code, which breaks on paths containing backslashes or quotes.
    globals()[name] = admin_shape
    shape_all.append(admin_shape)
shape_all = pd.concat(shape_all, axis=0).reset_index(drop=True)
PrintAdminUnits(shape_all)
# ----------------------------------------------- #

# FDW API host address -------------------------- #
# Query the FDW crop-production endpoint for South Africa and keep an
# untouched copy of the response table in `df_origin`.
host = 'https://fdw.fews.net'
# Credentials are read from token.json; the parsed JSON is turned into a tuple
# and passed as `auth` (presumably [username, password] - verify the file).
# The context manager closes the file handle deterministically.
with open('token.json', "r") as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'South Africa',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
# (connect, read) timeout in seconds so a stalled connection cannot hang the
# notebook forever; raise the read limit if the download is legitimately slow.
response = requests.get(
    host + endpoint, auth=auth, params=parameters, proxies={}, timeout=(10, 300)
)
# Stop here on any HTTP error status rather than parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())
df_origin = df.copy()
# ----------------------------------------------- #

# Set aside the country-level (admin_0) records and keep the rest in `df`.
# NOTE: the accuracy of the admin_0 data is uncertain. If it were a
# country-level aggregate, the production values would be too low; for example,
# the annual maize production of ZA1994A0 is below 1 million metric tons.
is_admin0 = df['fnid'] == 'ZA1994A0'
df_country = df[is_admin0].reset_index(drop=True)
df = df[~is_admin0].reset_index(drop=True)

- FEWS NET admin shapefiles ------------------- #
| year	 | Admin1   | # units   | Admin2   | # units   | Admin3   | # units   |
| 1994	 | ZA1994A1 | 9	| nan	| 0	| nan	| 0	|
----------------------------------------------- #


In [3]:
# Manual Pre-processing before Sweeping --------- #
# 1. Default setting
# Missing or empty values become 'none' for
# a) the crop production system and b) the population group.
for blank_column in ['crop_production_system', 'population_group']:
    is_blank = df[blank_column].isna() | (df[blank_column] == '')
    df.loc[is_blank, blank_column] = 'none'

# 2. Manual setting
# a) Add admin names to reporting units
# Reporting units are the FNIDs whose 7th character is not 'A'; their admin
# name is taken from the first word of "locality_name".
admin_level = 'admin_1'
is_reporting_fnid = df['fnid'].map(lambda x: x[6] != 'A')
reporting_unit = df.loc[is_reporting_fnid, 'fnid'].unique()
in_reporting_unit = df.fnid.isin(reporting_unit)
df.loc[in_reporting_unit, admin_level] = (
    df.loc[in_reporting_unit, 'locality_name'].map(lambda x: x.split(' ')[0])
)

# There are duplicate data points in Summer 1999 and Summer 2000 with different "publication_name" such as
# "CEC, 2001, South Africa", "FEWS NET, South Africa", and 'Official Database, 2016, South Africa'.
# For now, keep the most recently collected record (latest "collection_date").
rows = ['fnid','crop_production_system','season_year', 'product','indicator']
df['collection_date'] = pd.to_datetime(df['collection_date'])
df = df.sort_values(by=['collection_date'])
df = df.drop_duplicates(rows, keep='last')
df = df.reset_index(drop=True)

# "season_date", "start_date" and "period_date" also differ by "publication_name",
# so the month-day part is overwritten per season (the first 5 characters are kept).
season_month_day = {
    'Summer': {'season_date': '10-01', 'start_date': '04-01', 'period_date': '04-30'},
    'Winter': {'season_date': '04-01', 'start_date': '11-01', 'period_date': '11-30'},
}
for season_label, month_day_by_column in season_month_day.items():
    in_season = df['season_name'] == season_label
    for date_column, month_day in month_day_by_column.items():
        df.loc[in_season, date_column] = (
            df.loc[in_season, date_column].map(lambda x: x[:5] + month_day)
        )

# A new data source for 2016-2019 (or current) with a crop_production_system of "Commercial (PS)"
# overlaps with older sources including 'CEC, 2016, South Africa' and 'CEC, 2017, South Africa'.
# For now, keep the data points with the latest "collection_date".
cols = ['fnid','product','season_name','season_date','start_date','period_date','indicator','value']
df = df.sort_values(by=['collection_date'])
df = df.drop_duplicates(cols, keep='last')
df = df.reset_index(drop=True)
# ----------------------------------------------- #

# FDW Production Data Inspection ---------------- #
# The sweeper returns the cleaned table plus a second table kept as `df_raw`.
df, df_raw = FDW_PD_Sweeper(df, area_priority="Area Planted")
table_dict = FDW_PD_AvalTable(df, shape_all)
# ----------------------------------------------- #

# FEWS NET Shapefile comparison ----------------- #
# Only the 1994 Admin-1 layer is used for this country.
shape_used = pd.concat([ZA_Admin1_1994], axis=0)
PlotAdminShapes(shape_used, label=True)
# ----------------------------------------------- #

- Remove missing records ---------------------- #
Orignial data points: 6,080
Removed 1,261 "Missing Value" points
2,004/2,012 "Yield" points are retained.
1,410/1,418 "Quantity Produced" points are retained.
0/1,238 "Area Harvested" points are retained.
1,405/1,412 "Area Planted" points are retained.
Current data points: 4,819

- Minor changes are applied ------------------- #

- Basic information --------------------------- #
Data period: 1979 - 2019
5 grain types are found: Barley (Unspecified), Maize Grain (White), Maize Grain (Yellow), Sorghum, Wheat Grain
2 seasons are found: Winter (04-01), Summer (10-01)
2 crop production system are found: none, Commercial (PS)
Data sources include:
[1] RSA DAFF/Directorate, Statistics and Economic Analysis, South Africa --- CEC, South Africa
[2] FEWS NET, South Africa --- Official agricultural statistics, South Africa
[3] FEWS NET, South Africa --- CEC, South Africa
Administrative-1 fnids: 9
Administrative-2 fnids: 0
0 reporting units are foun

## South Africa
Southern Africa crop seasonal calendar

<!-- ![FEWS NET](https://fews.net/sites/default/files/styles/large/public/seasonal-calendar-southern-africa.png?itok=azI0iL6A) -->

![USDA](https://ipad.fas.usda.gov/rssiws/al/crop_calendar/images/safrica_sf_calendar.png)

- In South Africa, based on the FEWS NET data, the administrative units changed four times: 1980, 1998, 2011, and 2012.

| Year | Admin-1 | # units  | Admin-2  | # units |
| :---: | :----:  | :----:   | :----:   | :---:  |
| 1994  |**`ZA1994A1`** | 9  | nan      | 0 |

- Here, we use **`ZA1994A1`** to represent all admin-level 1 crop data.
- South Africa has two crop seasons: `Summer` and `Winter`.
- South Africa has two crop production systems: `none` and `Commercial (PS)`.

In [4]:
# Define the latest shapefile ------------------- #
# The 1994 Admin-1 layer, reprojected to EPSG:4326, serves as the latest boundary.
latest_level = 1
shape_latest = ZA_Admin1_1994.copy().to_crs('epsg:4326')
# ----------------------------------------------- #

# Validation of FNIDs and Names ----------------- #
# Fill an ADMIN2 placeholder on every layer before validation.
for admin_layer in (shape_used, shape_all, shape_latest):
    admin_layer['ADMIN2'] = 'None'
df = FDW_PD_ValidateFnidName(df, shape_used, shape_latest)
# ----------------------------------------------- #

# FDW Production Data Compiling ----------------- #
area, prod = FDW_PD_Compiling(df, shape_used)
# Keep untouched copies of the compiled tables.
area_all = area.copy()
prod_all = prod.copy()
# Unique column combinations after dropping the first two column levels.
mdx_pss = area.columns.droplevel([0,1]).unique()
# ----------------------------------------------- #

In [5]:
# Link admin boundaries ------------------------- #
# The same 1994 Admin-1 layer is passed as both layers of the link.
link, over = FDW_PD_CreateAdminLink(ZA_Admin1_1994, ZA_Admin1_1994, 'ADMIN1', 'ADMIN1', prod, epsg)
# Crop specific ratios
link_ratio = FDW_PD_RatioAdminLink(link, prod, over, mdx_pss)
# Add current unit to link_ratio (ratio 1.0 to itself), then order the
# columns in descending order.
for fnid_new, ratio_table in link_ratio.items():
    ratio_table[fnid_new] = 1.0
    link_ratio[fnid_new] = ratio_table.sort_index(axis=1, ascending=False)
# Connect data with AdminLink
area_new, prod_new = FDW_PD_ConnectAdminLink(link_ratio, area, prod, validation=True)
# ----------------------------------------------- #

In [6]:
# Manual correction ----------------------------- #
# Yield computed before the corrections below are applied.
crop_new = prod_new/area_new
# Suspected typos in the Summer / 'none' production-system series, overwritten
# in place. Each entry: (table, year, fnid, product, corrected value).
manual_corrections = [
    # Potential typo: 26 -> 1260
    (area_new, 2000, 'ZA1994A101', 'Maize Grain (Yellow)', 1260),
    # Potential typo: 31 -> 1310
    (area_new, 2001, 'ZA1994A101', 'Maize Grain (Yellow)', 1310),
    # Potential typo: 15450 -> 1204500 (from the removed source)
    (prod_new, 2001, 'ZA1994A108', 'Maize Grain (Yellow)', 1204500),
    # Potential typo: 20000 -> 300000 (from the removed source)
    # NOTE(review): this note originally read "330000" while the applied value
    # is 300000 - TODO confirm which one the removed source reports.
    (area_new, 2001, 'ZA1994A108', 'Maize Grain (Yellow)', 300000),
    # Potential typo: 25000 -> 972000 (from the removed source)
    (prod_new, 2001, 'ZA1994A108', 'Maize Grain (White)', 972000),
    # Potential typo: 50000 -> 270000 (from the removed source)
    (area_new, 2001, 'ZA1994A108', 'Maize Grain (White)', 270000),
]
for target_table, fix_year, fix_fnid, fix_crop, fixed_value in manual_corrections:
    target_table.loc[fix_year, pd.IndexSlice[fix_fnid,:,fix_crop,'Summer',:,:,'none']] = fixed_value
# ----------------------------------------------- #

# Complete long format DataFrame ---------------- #
# Reshape the calibrated area / production tables (and their ratio, the yield)
# from wide to long format, one row per value, tagged by "indicator".
df_area = area_new.T.stack().rename('value').reset_index()
df_area['indicator'] = 'area'
df_prod = prod_new.T.stack().rename('value').reset_index()
df_prod['indicator'] = 'production'
df_yield = (prod_new/area_new).T.stack().rename('value').reset_index()
df_yield['indicator'] = 'yield'
stack = pd.concat([df_area, df_prod, df_yield], axis=0)
# Add "planting year"
# The season table is joined on every listed column except the last one
# ("planting_year"), which is the column being added.
cols = ['season_name','product','crop_production_system','planting_month','harvest_year','harvest_month','planting_year']
season_table = df[cols].drop_duplicates()
stack = stack.merge(season_table, on=cols[:-1])
# Add country and admin names
stack = stack.merge(df[['fnid','country','country_code','admin_1','admin_2']].drop_duplicates(), on='fnid', how='inner')
names = [
    'fnid','country','country_code','admin_1','admin_2','name',
    'product','season_name','planting_year','planting_month','harvest_year','harvest_month',
    'crop_production_system','indicator','value'
]
# Take an explicit copy before adding a column: assigning into a column
# selection is chained assignment, which is only quiet here because the
# warning is disabled globally.
stack_gscd = stack[names].copy()
stack_gscd['gscd_code'] = 'calibrated'
# ----------------------------------------------- #

# Reported FDW data ----------------------------- #
# Same columns as the calibrated table, taken from the swept FDW records.
# An explicit copy avoids assigning into a selection of `df` (chained
# assignment, otherwise hidden by the globally disabled warning).
stack_fdw = df[names].copy()
# Harmonize indicator labels with the calibrated table.
# NOTE(review): the sweeper was run with area_priority="Area Planted"; if it
# leaves that label in "indicator", those rows are not renamed to 'area'
# here - TODO confirm against FDW_PD_Sweeper.
stack_fdw['indicator'] = stack_fdw['indicator'].replace({'Area Harvested':'area','Quantity Produced':'production','Yield':'yield'})
stack_fdw['gscd_code'] = 'reported'
# ----------------------------------------------- #

# Final Processing ------------------------------ #
# Reported rows first, calibrated rows second, with a fresh index.
stack = pd.concat([stack_fdw,stack_gscd], axis=0).reset_index(drop=True)
# Separate "Maize Grain (White)" and "Maize Grain (Yellow)"
product_category_custom = {
    'Barley (Unspecified)':'Barley',
    'Maize Grain (White)':'Maize (White)',
    'Maize Grain (Yellow)':'Maize (Yellow)',
    'Sorghum':'Sorghum',
    'Wheat Grain':'Wheat'
}
stack['product'] = stack['product'].replace(product_category_custom)
df['product'] = df['product'].replace(product_category_custom)
# Apply the same renaming to the row index of every ratio table (in place).
for ratio_fnid, ratio_table in link_ratio.items():
    index_frame = ratio_table.index.to_frame().reset_index(drop=True)
    renamed_frame = index_frame.replace(product_category_custom)
    ratio_table.index = pd.MultiIndex.from_frame(renamed_frame)
# Calibration of planting and Harvest year and season
# NOTE(review): `crop_calendar` is not referenced by any visible cell and its
# season names differ from this country's (Summer/Winter) - TODO confirm it is needed.
crop_calendar = {
    'season_name': ['Winter','Spring'],
    'crop_production_system':['rainfed','irrigated'],
    'planting_month':['10-01','03-01'],
    'harvest_month':['07-01','06-01']
}
# Per-season month mapping (each month maps to itself here).
cs = {
    'Summer': {'planting_month':{'10-01':'10-01'},'harvest_month':{'04-01':'04-01'}},
    'Winter': {'planting_month':{'04-01':'04-01'},'harvest_month':{'11-01':'11-01'}},
}
# Per-season year offsets handed to the calibration helper.
cy = {'Summer': {'planting_year':-1, 'harvest_year': -1}}
stack, df, link_ratio = FDW_PD_CaliSeasonYear(stack, df, link_ratio, cs, cy)
# Missing admin names become the string 'none'.
for admin_column in ['admin_1', 'admin_2']:
    stack.loc[stack[admin_column].isna(), admin_column] = 'none'
# ----------------------------------------------- #

# Save data
# Combined table, swept FDW table, and the admin-link ratios.
save_hdf('./data/crop/adm_crop_production_ZA.hdf', stack)
save_hdf('./data/crop/adm_crop_production_ZA_raw.hdf', df)
save_npz('./data/crop/adm_crop_production_ZA_ratio.npz', link_ratio)

./data/crop/adm_crop_production_ZA.hdf is saved.
./data/crop/adm_crop_production_ZA_raw.hdf is saved.
./data/crop/adm_crop_production_ZA_ratio.npz is saved.


## Visualization of production data

In [7]:
# Bar chart of national grain production
# One figure per season, built from the calibrated records only.
country_iso = 'ZA'
country_name = 'South Africa'
df = pd.read_hdf('./data/crop/adm_crop_production_%s.hdf' % country_iso)
# Keep calibrated rows and expose the harvest year as "year".
df = df[df['gscd_code'] == 'calibrated'].assign(year=lambda d: d['harvest_year'])
year = [df['year'].min(), df['year'].max()]
product_order = ['Maize (White)','Maize (Yellow)','Wheat','Sorghum']
for season_name in ['Summer','Winter']:
    sub = df[df['season_name'] == season_name]
    footnote = 'National grain production in %s - %s' % (country_name, season_name)
    fn_save = './figures/%s_bar_natgrainprod_%s.png' % (country_iso, season_name)
    fig = PlotBarProduction(sub, year, product_order, footnote, fn_save)
    fig.show()

./figures/ZA_bar_natgrainprod_Summer.png is saved.


./figures/ZA_bar_natgrainprod_Winter.png is saved.


In [8]:
# Lineplot of Production-Area-Yield (PAY) time-series
# One figure per (product, season) pair, built from the calibrated records only.
country_iso = 'ZA'
country_name = 'South Africa'
df = pd.read_hdf('./data/crop/adm_crop_production_%s.hdf' % country_iso)
# Keep calibrated rows and expose the harvest year as "year".
df = df[df['gscd_code'] == 'calibrated'].assign(year=lambda d: d['harvest_year'])
year = [df['year'].min(), df['year'].max()]
product_season = [
    ['Maize (White)','Summer'],
    ['Maize (Yellow)','Summer'],
    ['Wheat','Winter'],
]
for product_name, season_name in product_season:
    is_product = df['product'] == product_name
    is_season = df['season_name'] == season_name
    sub = df[is_product & is_season]
    footnote = 'Production-Area-Yield (PAY) time-series of %s - %s - %s' % (country_iso, product_name, season_name)
    fn_save = './figures/%s_line_pay_%s_%s.png' % (country_iso, product_name, season_name)
    fig = PlotLinePAY(sub, year, footnote, fn_save)
    fig.show()

./figures/ZA_line_pay_Maize (White)_Summer.png is saved.


./figures/ZA_line_pay_Maize (Yellow)_Summer.png is saved.


./figures/ZA_line_pay_Wheat_Winter.png is saved.


In [9]:
# Calibrated PAY time-series per FNID
# One figure per (product, season, FNID), built from the calibrated records and
# the saved admin-link ratios.
from tools_graphic import PlotLineCropTS
country_iso = 'ZA'
country_name = 'South Africa'
df = pd.read_hdf('./data/crop/adm_crop_production_%s.hdf' % country_iso)
# Keep calibrated rows and expose the harvest year as "year".
df = df[df['gscd_code'] == 'calibrated'].assign(year=lambda d: d['harvest_year'])
link_ratio = load_npz('./data/crop/adm_crop_production_%s_ratio.npz' % country_iso)
# Every year from the first to the last harvest year, inclusive.
year_all = np.arange(df['year'].min(), df['year'].max()+1)
product_season = [
    ['Maize (White)','Summer'],
    ['Maize (Yellow)','Summer'],
    ['Wheat','Winter'],
]
for product_name, season_name in product_season:
    is_product = df['product'] == product_name
    is_season = df['season_name'] == season_name
    sub = df[is_product & is_season]
    for fnid in sub['fnid'].unique():
        sub_fps = sub[sub['fnid'] == fnid]
        fn_save = './figures/crop_calibrated/%s_%s_%s_%s.png' % (country_iso, product_name, season_name, fnid)
        fig = PlotLineCropTS(sub_fps, fnid, product_name, season_name, link_ratio, year_all, fn_save)

./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A101.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A102.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A103.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A104.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A105.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A106.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A107.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A108.png is saved.
./figures/crop_calibrated/ZA_Maize (White)_Summer_ZA1994A109.png is saved.
./figures/crop_calibrated/ZA_Maize (Yellow)_Summer_ZA1994A101.png is saved.
./figures/crop_calibrated/ZA_Maize (Yellow)_Summer_ZA1994A102.png is saved.
./figures/crop_calibrated/ZA_Maize (Yellow)_Summer_ZA1994A103.png is saved.
./figures/crop_calibrated/ZA_Maize (Yellow)_Summer_ZA1994A104.png is saved.
./figures/crop_calibr