# FDW Crop Production Data Analysis - Zimbabwe

## Import auxiliary data

In [9]:
import numpy as np
import pandas as pd
import geopandas as gpd
# Build a long-format maize table for Zimbabwe from the GSCD workbook.
# The sheet is read with a two-row column header and the first column as index,
# then transposed so the first row level selects 'Area' / 'Production' and the
# columns are provinces.
df = pd.read_excel(
    '../data/auxiliary/zimbabwe/GSCD_Zimbabwe_Maize_Production.xlsx',
    sheet_name='Sheet1', header=[0, 1], index_col=0
).T
area, prod = df.loc['Area'], df.loc['Production']
# Area and production must be missing in exactly the same cells.
# NOTE: the builtin all() applied to a DataFrame only iterates its column
# labels (always truthy here), so the check has to reduce over the cell values.
assert (area.isna() == prod.isna()).values.all(), \
    'Area and Production have different missing-value patterns'
# Yield is derived as production per unit of area.
crop = prod / area


def _to_long(table, indicator):
    """Stack a (season x province) table into long rows labelled with `indicator`."""
    long_table = table.stack().reset_index()
    long_table.columns = ['year', 'province', 'value']
    long_table['indicator'] = indicator
    return long_table


area = _to_long(area, 'Harvested Area')
prod = _to_long(prod, 'Quantity Produced')
crop = _to_long(crop, 'Yield')
df = pd.concat([area, prod, crop], axis=0)
# Exclude total. The explicit copy makes the column assignments below act on an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
df = df[df['province'] != 'Total'].copy()
# Modify years: the harvest year is the part after the '-' in the season label
# (presumably labels look like "2000-2001" -- confirm against the workbook).
df['harvest_year'] = df['year'].apply(lambda season: int(season.split('-')[1]))
df['harvest_month'] = 6
df['planting_year'] = df['harvest_year'] - 1
df['planting_month'] = 11
# FNID and country names
fnid_dict = {
    'Mashonaland West': 'ZW2011A114',
    'Mashonaland Central': 'ZW2011A112',
    'Mashonaland East': 'ZW2011A113',
    'Manicaland': 'ZW2011A111',
    'Midlands': 'ZW2011A117',
    'Masvingo': 'ZW2011A118',
    'Matabeleland North': 'ZW2011A115',
    'Matabeleland South': 'ZW2011A116'
}
# Direct dictionary lookup on purpose: a province missing from fnid_dict raises
# KeyError instead of silently producing a missing FNID.
df['fnid'] = df['province'].apply(lambda province: fnid_dict[province])
df['country'] = 'Zimbabwe'
df['country_code'] = 'ZW'
df = df.rename(columns={'province': 'admin_1'})
# No second administrative level in this table; 'name' mirrors the empty admin_2.
df['admin_2'] = np.nan
df['name'] = df['admin_2']
# Other variables
df['product'] = 'Maize'
df['season_name'] = 'Main'
df['crop_production_system'] = 'none'
df['gscd_code'] = 'calibrated'
# Organize columns
df = df[[
    'fnid', 'country', 'country_code', 'admin_1', 'admin_2', 'name',
    'product', 'season_name',
    'planting_year', 'planting_month', 'harvest_year', 'harvest_month',
    'crop_production_system', 'indicator', 'value', 'gscd_code'
]]


Unnamed: 0,fnid,country,country_code,admin_1,admin_2,name,product,season_name,planting_year,planting_month,harvest_year,harvest_month,crop_production_system,indicator,value,gscd_code
0,ZW2011A114,Zimbabwe,ZW,Mashonaland West,,,Maize,Main,2000,11,2001,6,none,Harvested Area,198919.000000,calibrated
1,ZW2011A112,Zimbabwe,ZW,Mashonaland Central,,,Maize,Main,2000,11,2001,6,none,Harvested Area,140175.000000,calibrated
2,ZW2011A113,Zimbabwe,ZW,Mashonaland East,,,Maize,Main,2000,11,2001,6,none,Harvested Area,220136.000000,calibrated
3,ZW2011A111,Zimbabwe,ZW,Manicaland,,,Maize,Main,2000,11,2001,6,none,Harvested Area,158629.000000,calibrated
4,ZW2011A117,Zimbabwe,ZW,Midlands,,,Maize,Main,2000,11,2001,6,none,Harvested Area,209348.000000,calibrated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,ZW2011A111,Zimbabwe,ZW,Manicaland,,,Maize,Main,2021,11,2022,6,none,Yield,0.573631,calibrated
175,ZW2011A117,Zimbabwe,ZW,Midlands,,,Maize,Main,2021,11,2022,6,none,Yield,0.422668,calibrated
176,ZW2011A118,Zimbabwe,ZW,Masvingo,,,Maize,Main,2021,11,2022,6,none,Yield,0.324001,calibrated
177,ZW2011A115,Zimbabwe,ZW,Matabeleland North,,,Maize,Main,2021,11,2022,6,none,Yield,0.230146,calibrated


### FAO

In [9]:
import pandas as pd
# Load FAO-STAT National Production Data ------------- #
# The first CSV column is used as the row index.
tmp = pd.read_csv('../data/crop/adm_fao_stat.csv', index_col=0)
# Keep only Zimbabwe's maize "Production" records and expose them as a Series
# of values keyed by year.
data_fao = (
    tmp
    .loc[lambda t: t['cnt_name'].eq('Zimbabwe')
         & t['cpc2_name'].eq('Maize (corn)')
         & t['indicator'].eq('Production')]
    .set_index('year')
    ['value']
)
# ---------------------------------------------------- #
# data = pd.concat([data_fao, data_fdw], axis=1, keys=['FAO','GSCD']).sort_index()
# data.reindex(np.array(range(1961,2024)))

## Report 1: Duplicates in the data

In [4]:
import json, requests
import pandas as pd
# Download crop production indicator values for Zimbabwe from the FDW API.
host = 'https://fdw.fews.net'
# Credentials come from a local token.json and are handed to requests as a
# tuple (presumably a two-item [user, password] list -- confirm the file layout).
# The context manager closes the file handle, which the bare open().read() did not.
with open('token.json', 'r') as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'Zimbabwe',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# Fail loudly on HTTP errors instead of parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())

In [5]:
# Distinct product names present in the downloaded records.
pd.unique(df['product'])

array(['Rice (Paddy)', 'Maize Grain (White)', 'Millet (Pearl)',
       'Millet (Finger)', 'Sorghum'], dtype=object)

In [9]:
# Count white-maize "Quantity Produced" records per unit (fnid) and season;
# a count above 1 means more than one record for the same unit and season.
(
    df
    .loc[lambda d: d['indicator'].eq('Quantity Produced')
         & d['product'].eq('Maize Grain (White)')]
    .pivot_table(index='fnid', columns='season_year', values='value', aggfunc='count')
)

season_year,Main harvest 2014,Main harvest 2015,Main harvest 2016,Main harvest 2018,Main harvest 2019,Main harvest 2020,Main harvest 2021
fnid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ZW2011A21021,,1.0,,,,,
ZW2011A21101,1.0,1.0,2.0,1.0,1.0,1.0,1.0
ZW2011A21102,1.0,1.0,2.0,1.0,1.0,1.0,1.0
ZW2011A21103,1.0,1.0,2.0,,,1.0,1.0
ZW2011A21104,1.0,1.0,2.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
ZW2011A21803,1.0,1.0,2.0,1.0,1.0,1.0,1.0
ZW2011A21804,1.0,1.0,2.0,1.0,1.0,1.0,1.0
ZW2011A21805,1.0,1.0,2.0,,,1.0,1.0
ZW2011A21806,1.0,1.0,2.0,1.0,1.0,1.0,1.0


In [8]:
# Count white-maize "Quantity Produced" records per season and source
# publication, to see which publications contribute records to each season.
pd.pivot_table(
    df.loc[lambda d: d['indicator'].eq('Quantity Produced')
           & d['product'].eq('Maize Grain (White)')],
    index='season_year',
    columns='publication_name',
    values='value',
    aggfunc='count',
)

publication_name,"Ministry of Agriculture, Estimation Division, 2014, Zimbabwe","Ministry of Agriculture, Estimation Division, 2015, Zimbabwe","Ministry of Agriculture, Estimation Division, 2016, Zimbabwe","Ministry of Agriculture, Estimation Division, 2017, Zimbabwe","Ministry of Agriculture, Estimation Division, 2020, Zimbabwe","Ministry of Agriculture, Estimation Division, 2021, Zimbabwe","Second Round Crop and Livestock Assessment Report 2017/2018 season, Zimbabwe"
season_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Main harvest 2014,59.0,,,,,,
Main harvest 2015,,62.0,,,,,
Main harvest 2016,,,60.0,61.0,,,
Main harvest 2018,,,,,,,49.0
Main harvest 2019,,,,,,,50.0
Main harvest 2020,,,,,61.0,,
Main harvest 2021,,,,,,61.0,


## Report 1: Year-off problem

In [1]:
# Reproduce the year-off problem in FDW-ZA crop data
import json, requests
import pandas as pd
# Download crop production indicator values for South Africa from the FDW API.
host = 'https://fdw.fews.net'
# Credentials come from a local token.json and are handed to requests as a
# tuple (presumably a two-item [user, password] list -- confirm the file layout).
# The context manager closes the file handle, which the bare open().read() did not.
with open('token.json', 'r') as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'South Africa',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# Fail loudly on HTTP errors instead of parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())
# Remove national level values
df = df[df['fnid'] != 'ZA1994A0'].reset_index(drop=True)
# Table of "Maize Grain (Yellow)"
sub = df[
    (df['season_name'] == 'Summer') &
    (df['product'].isin(['Maize Grain (Yellow)'])) &
    (df['indicator'] == 'Quantity Produced')
]
# Keep start_date / period_date next to season_date / season_year in the index
# so the reporting dates can be compared with the season label they carry.
sub.pivot_table(
    index=['start_date', 'period_date', 'season_name', 'season_type', 'season_date', 'season_year'],
    columns='product', values='value', aggfunc='sum'
).tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,product,Maize Grain (Yellow)
start_date,period_date,season_name,season_type,season_date,season_year,Unnamed: 6_level_1
2017-02-01,2017-02-28,Summer,Harvest,2016-10-01,Summer 2016,4283100.0
2017-04-01,2017-04-30,Summer,Harvest,2016-10-01,Summer 2016,4283100.0
2018-04-01,2018-04-30,Summer,Harvest,2017-10-01,Summer 2017,6904000.0
2019-04-01,2019-04-30,Summer,Harvest,2018-10-01,Summer 2018,6129650.0
2020-04-01,2020-04-30,Summer,Harvest,2019-10-01,Summer 2019,5648080.0


## Report 2: Duplication problem 

In [2]:
# Reproduce the duplication problem in FDW-ZA crop data
import json, requests
import pandas as pd
# Download crop production indicator values for South Africa from the FDW API.
host = 'https://fdw.fews.net'
# Credentials come from a local token.json and are handed to requests as a
# tuple (presumably a two-item [user, password] list -- confirm the file layout).
# The context manager closes the file handle, which the bare open().read() did not.
with open('token.json', 'r') as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'South Africa',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# Fail loudly on HTTP errors instead of parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())
# Remove national level values
df = df[df['fnid'] != 'ZA1994A0'].reset_index(drop=True)
# Add "None" to undefined [publication_name] so that records without a
# publication get their own column in the pivot below.
df.loc[df['publication_name'] == '', 'publication_name'] = 'None'
# Table of "Maize Grain (Yellow)"
sub = df[
    (df['season_name'] == 'Summer') &
    (df['product'].isin(['Maize Grain (Yellow)'])) &
    (df['indicator'] == 'Quantity Produced')
]
# Split the totals by publication to show which source each dated record comes from.
sub.pivot_table(
    index=['start_date', 'period_date', 'season_date', 'season_year'],
    columns=['product', 'publication_name'], values='value', aggfunc='sum'
).tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,product,Maize Grain (Yellow),Maize Grain (Yellow),Maize Grain (Yellow),Maize Grain (Yellow),Maize Grain (Yellow)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,publication_name,"CEC, 2001, South Africa","CEC, 2016, South Africa","FEWS NET, South Africa",None,"Official Database, 2016, South Africa"
start_date,period_date,season_date,season_year,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2017-02-01,2017-02-28,2016-10-01,Summer 2016,,4283100.0,,,
2017-04-01,2017-04-30,2016-10-01,Summer 2016,,,,4283100.0,
2018-04-01,2018-04-30,2017-10-01,Summer 2017,,,,6904000.0,
2019-04-01,2019-04-30,2018-10-01,Summer 2018,,,,6129650.0,
2020-04-01,2020-04-30,2019-10-01,Summer 2019,,,,5648080.0,


## Report 3: Duplication between products

In [3]:
# Reproduce the duplication between products in FDW-ZA crop data
import json, requests
import pandas as pd
# Download crop production indicator values for South Africa from the FDW API.
host = 'https://fdw.fews.net'
# Credentials come from a local token.json and are handed to requests as a
# tuple (presumably a two-item [user, password] list -- confirm the file layout).
# The context manager closes the file handle, which the bare open().read() did not.
with open('token.json', 'r') as token_file:
    auth = tuple(json.load(token_file))
parameters = {
    'format': 'json',
    'country': 'South Africa',
    'product': 'R011',
    'survey_type': 'crop:best'
}
endpoint = '/api/cropproductionindicatorvalue/'
response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# Fail loudly on HTTP errors instead of parsing an error payload.
response.raise_for_status()
df = pd.DataFrame.from_records(response.json())
# Remove national level values
df = df[df['fnid'] != 'ZA1994A0'].reset_index(drop=True)
# Table of all products
sub = df[
    (df['season_name'] == 'Summer') &
    (df['indicator'] == 'Quantity Produced')
]
# One column per product, so identical totals appearing under different
# products for the same season can be seen side by side.
sub.pivot_table(
    index=['start_date', 'period_date', 'season_name', 'season_type', 'season_date', 'season_year'],
    columns='product', values='value', aggfunc='sum'
).tail(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,product,Maize Grain (White),Maize Grain (Yellow),Sorghum,Wheat Grain
start_date,period_date,season_name,season_type,season_date,season_year,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2007-02-01,2007-02-28,Summer,Harvest,2006-10-01,Summer 2006,4038900.0,2430600.0,,96000.0
2007-04-01,2007-04-30,Summer,Harvest,2006-10-01,Summer 2006,4038900.0,2430600.0,96000.0,
2008-02-01,2008-02-29,Summer,Harvest,2007-10-01,Summer 2007,4315000.0,2810000.0,,169700.0
2008-04-01,2008-04-30,Summer,Harvest,2007-10-01,Summer 2007,4315000.0,2810000.0,169700.0,
2009-02-01,2009-02-28,Summer,Harvest,2008-10-01,Summer 2008,7480000.0,5220000.0,,255000.0
2009-04-01,2009-04-30,Summer,Harvest,2008-10-01,Summer 2008,7480000.0,5220000.0,255000.0,
2010-02-01,2010-02-28,Summer,Harvest,2009-10-01,Summer 2009,6775000.0,5275000.0,,276500.0
2010-04-01,2010-04-30,Summer,Harvest,2009-10-01,Summer 2009,6775000.0,5275000.0,276500.0,
2011-02-01,2011-02-28,Summer,Harvest,2010-10-01,Summer 2010,7830000.0,4985000.0,,196500.0
2011-04-01,2011-04-30,Summer,Harvest,2010-10-01,Summer 2010,7830000.0,4985000.0,196500.0,


In [4]:
# # FDW API host address -------------------------- #
# host = 'https://fdw.fews.net'
# auth = tuple(json.loads(open('token.json', "r").read()))
# parameters = {
#     'format': 'json',
#     'country': 'South Africa',
#     'product': 'R011',
#     'survey_type': 'crop:best'
# }
# endpoint = '/api/cropproductionindicatorvalue/'
# response = requests.get(host + endpoint, auth=auth, params=parameters, proxies={})
# response.raise_for_status()
# df = pd.DataFrame.from_records(response.json())
# df_origin = df.copy()
# # ----------------------------------------------- #

# # FDW raw data
# df = df_origin.copy()
# df = df[df['fnid'] != 'ZA1994A0'].reset_index(drop=True)
# df = df[
#     (df['status'] == 'Collected') &
#     (df['season_name'] == 'Summer') &
#     (df['indicator'] == 'Quantity Produced') &
#     (df['product'].isin(['Maize Grain (White)', 'Maize Grain (Yellow)']))
# ]
# # Drop duplicated values by selecting values having more recent "collection_date".
# cols = ['fnid','product','season_name','season_date','indicator','value']
# df = df.sort_values(by=['collection_date']).drop_duplicates(cols, keep='last').reset_index(drop=True)
# df['year'] = pd.to_datetime(df['season_date']).dt.year
# data = df.pivot_table(index='year',columns='product',values='value',aggfunc='sum').sum(1)
# years = np.array(data.index)

## Validation with FAO data and CHIRPS

In [5]:
# Load GSCD crop data
country_iso, country_name = 'ZA', 'South Africa'
df = pd.read_hdf('../data/crop/adm_crop_production_%s.hdf' % country_iso)
# Keep only the records flagged as calibrated, and list the units they cover.
df = df[df['gscd_code'] == 'calibrated']
fnids = list(df['fnid'].unique())
# Summer-season production of white and yellow maize. The 'year' column is
# added with .assign(), which returns a new frame; assigning into the filtered
# slice directly is what raised SettingWithCopyWarning.
data_fdw = df[
    (df['season_name'] == 'Summer') &
    (df['product'].isin(['Maize (White)', 'Maize (Yellow)'])) &
    (df['indicator'] == 'production')
].assign(year=lambda d: d['harvest_year'])
# Total production per harvest year, one column per product.
data_fdw = data_fdw.pivot_table(index='year', columns='product', values='value', aggfunc='sum')
data_fdw.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fdw['year'] = data_fdw['harvest_year']


product,Maize (White),Maize (Yellow)
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,4735000.0,5220000.0
2016,3253775.0,4283100.0
2017,9916000.0,6904000.0
2018,6759800.0,6129650.0
2019,5538240.0,5648080.0
