In [473]:
import pandas as pd
import json

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Production table. keep_default_na=False keeps blank cells as empty strings
# instead of NaN, and 'Item Code' is read as text.
all_df = pd.read_csv(
    './data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data_NOFLAG.csv',
    encoding="ISO-8859-1",
    keep_default_na=False,
    dtype={'Item Code': str},
)

# Country lookup table: one row per Alpha-2 code, keeping the first row in
# country-name order when a code occurs more than once.
lat_lon_code = pd.read_csv('./data/world_country_and_usa_states_latitude_and_longitude_values.csv')
lat_lon_code = lat_lon_code.sort_values(by=['Country']).drop_duplicates(subset=['Alpha-2 code'])

# Strip the surrounding spaces and double quotes from the code/coordinate
# fields and convert the numeric ones. The columns are replaced wholesale with
# assign(): `df.loc[:, col] = values` writes into the existing column in place
# on pandas >= 2.0, which would keep the old object dtype and discard the
# int/float conversion.
lat_lon_code = lat_lon_code.assign(**{
    "Numeric code": lat_lon_code["Numeric code"].str.strip(' "').astype(int),
    "Alpha-2 code": lat_lon_code["Alpha-2 code"].str.strip(' "'),
    "Alpha-3 code": lat_lon_code["Alpha-3 code"].str.strip(' "'),
    "Latitude (average)": lat_lon_code["Latitude (average)"].str.strip(' "').astype(float),
    "Longitude (average)": lat_lon_code["Longitude (average)"].str.strip(' "').astype(float),
})

# Numeric country code -> Alpha-2 code.
area_code_to_iso_dict = lat_lon_code[["Numeric code", "Alpha-2 code"]].set_index("Numeric code")['Alpha-2 code'].to_dict()

# Area codes are stripped of spaces and apostrophes and converted to integers
# so they can be matched against the numeric keys of area_code_to_iso_dict.
# Plain column assignment is used because `df.loc[:, col] = values` writes in
# place on pandas >= 2.0 and would leave the column with its old object dtype.
all_df['Area Code (M49)'] = all_df['Area Code (M49)'].str.strip(" '").astype(int)

# Known numeric codes become Alpha-2 codes; codes missing from the lookup are
# left unchanged by replace() and are removed by the filter below.
all_df['ISO-A2'] = all_df['Area Code (M49)'].replace(area_code_to_iso_dict)
all_df = all_df[all_df['ISO-A2'].isin(lat_lon_code['Alpha-2 code'])]

def make_year_cols(start, end):
    """Return the year column labels 'Y<start>' ... 'Y<end - 1>'.

    `end` is exclusive, exactly as in `range(start, end)`.
    """
    labels = []
    for year in range(start, end):
        labels.append('Y{}'.format(year))
    return labels

In [474]:
# Attach coordinates, country name and ISO codes to every production row.
all_df = pd.merge(
    all_df,
    lat_lon_code[['Latitude (average)', 'Longitude (average)', 'Country', 'Alpha-2 code', 'Alpha-3 code']],
    left_on='ISO-A2',
    right_on='Alpha-2 code',
    how='left',
)

# Wide -> long: one row per (country, item, element, unit, year).
all_df = pd.melt(
    all_df,
    id_vars=["Country", "ISO-A2", "Alpha-3 code", "Longitude (average)", "Latitude (average)", 'Item', 'Item Code', 'Element', 'Unit'],
    value_vars=make_year_cols(2000, 2021),
    var_name='Year',
    value_name='Quantity',
)

# 'Y2000' -> 2000; blank quantities are turned into 0 before the float cast.
all_df = all_df.assign(
    Year=all_df['Year'].str.strip('Y').astype(int),
    Quantity=all_df['Quantity'].replace({'': 0}).astype(float),
).rename(columns={"Country": "Area"})

# Names ending in 'of' / 'of the' have their comma-separated parts reversed
# and joined with spaces; every other name is kept as is.
all_df['Area'] = all_df['Area'].apply(
    lambda name: ' '.join(name.split(', ')[::-1]) if name.endswith(('of', 'of the')) else name
)

In [475]:
# Population table reshaped to one row per (country, year) for 2000-2020.
population = pd.read_csv('./data/population_by_country.csv').melt(
    id_vars=['Country Name', 'Country Code'],
    value_vars=list(map(str, range(2000, 2021))),
    var_name='Year',
    value_name='Population',
)
population = population.astype({'Year': int})

# Left join keeps every production row, matched on Alpha-3 code and year.
all_df = all_df.merge(
    population,
    left_on=['Alpha-3 code', 'Year'],
    right_on=['Country Code', 'Year'],
    how='left',
)

In [476]:
# Region table (semicolon-separated); blank cells are kept as empty strings.
continent = pd.read_csv(
    './data/UNSD — Methodology.csv',
    sep=';',
    encoding='utf-8',
    keep_default_na=False,
)

# Add each row's region name as 'Continent', then drop the duplicate join key.
all_df = (
    pd.merge(
        all_df,
        continent[['Region Name', 'ISO-alpha2 Code']],
        left_on='ISO-A2',
        right_on='ISO-alpha2 Code',
        how='left',
    )
    .drop(columns=['ISO-alpha2 Code'])
    .rename(columns={'Region Name': 'Continent'})
)

# Classify items into major categories using CPC codes

In [477]:
# CPC code table; 'CPC21code' is read as text (prefixes such as '011' start with a zero).
codes = pd.read_csv(
    './data/cpc_codes.csv',
    dtype={'CPC21code': str},
    encoding='ISO-8859-1',
)

In [478]:
# CPC code prefix -> major item category.
code_to_type_lookup = dict([
    ('011', 'Cereals'),
    ('012', 'Vegetables'),
    ('013', 'Fruits and nuts'),
    ('211', 'Meat and meat products'),
    ('22', 'Dairy and egg'),
])

# Hand-picked item names for the prefixes that are not resolved through the
# CPC title table.
manual_code_to_item = {
    '211': [
        # meat
        'Meat of camels, fresh or chilled',
        'Meat of cattle with the bone, fresh or chilled',
        'Meat of pig with the bone, fresh or chilled',
        'Meat of asses, fresh or chilled',
        'Meat of mules, fresh or chilled',
        'Meat of other domestic camelids, fresh or chilled',
        'Meat of other domestic rodents, fresh or chilled',
        'Meat of pigeons and other birds n.e.c., fresh, chilled or frozen',
        # edible offal
        'Edible offal of cattle, fresh, chilled or frozen',
        'Edible offal of goat, fresh, chilled or frozen',
        'Edible offal of sheep, fresh, chilled or frozen',
        'Edible offal of pigs, fresh, chilled or frozen',
        'Edible offal of buffalo, fresh, chilled or frozen',
    ],
    '22': [
        # milk
        'Raw milk of camel',
        'Raw milk of cattle',
        'Raw milk of goats',
        'Raw milk of sheep',
        'Raw milk of buffalo',
        # eggs
        'Hen eggs in shell, fresh',
        'Eggs from other birds in shell, fresh, n.e.c.',
    ],
}

In [479]:
# Label every row whose item belongs to one of the major categories.
for code, code_type in code_to_type_lookup.items():
    # A hand-picked item list wins; otherwise use the titles of all CPC
    # entries whose code starts with this prefix.
    item_names = manual_code_to_item.get(code)
    if item_names is None:
        item_names = codes.loc[codes['CPC21code'].str.startswith(code), 'CPC21title']
    all_df.loc[all_df['Item'].isin(item_names), 'Item Hierarchy Type'] = code_type

In [502]:
# Which 'Element' values occur for the 'Dairy and egg' rows?
all_df.loc[all_df['Item Hierarchy Type'] == 'Dairy and egg', 'Element'].unique()

array(['Yield', 'Production', 'Laying', 'Milk Animals'], dtype=object)

# Check that country names match the map data (ne_110m.json)

In [24]:
# Read the map file as raw text; the name check searches it as a plain string.
# The encoding is given explicitly because open() otherwise uses the
# platform's locale encoding, which can garble non-ASCII country names
# (JSON files are UTF-8 by specification).
with open('./data/ne_110m.json', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Print every area name with no exact `"NAME":"<name>"` entry in the map text.
for country_name in all_df['Area'].unique():
    name_entry = '"NAME":"{}"'.format(country_name)
    if name_entry not in text:
        print(country_name)


# Wrangle and export the processed files

In [506]:
# Columns written to the per-country exports, in output order.
select_columns = (
    ['Area', 'ISO-A2', 'Item', "Quantity", "Unit"]
    + ['Latitude (average)', 'Longitude (average)', 'Continent']
    + ['Year', 'Population', 'Item Hierarchy Type']
)

# A row is dropped from the exports when any of these fields is null.
remove_if_null_in_subset = [
    "ISO-A2",
    "Continent",
    "Quantity",
    "Item Hierarchy Type",
]

In [507]:
# 'Production' rows only, without rows that lack a required field, reduced to
# the export columns.
df = (
    all_df.loc[all_df['Element'].isin(['Production'])]
    .dropna(subset=remove_if_null_in_subset, how='any', axis=0)
    .loc[:, select_columns]
)

df.to_csv('data/processed/production_major_categories_per_country_2000-2020.csv', index=False)

In [508]:
# Rows whose element starts with 'Yield', without rows that lack a required
# field, reduced to the export columns.
df = (
    all_df.loc[all_df['Element'].str.startswith('Yield')]
    .dropna(subset=remove_if_null_in_subset, how='any', axis=0)
    .loc[:, select_columns]
)

df.to_csv('data/processed/yield_major_categories_per_country_2000-2020.csv', index=False)

In [509]:
# Total 'Production' quantity per item, year and unit, summed over all
# countries that survive the null filter.
df = (
    all_df.loc[all_df["Element"] == "Production"]
    .dropna(subset=remove_if_null_in_subset, how='any', axis=0)
    .groupby(["Item", "Year", "Unit"])[["Quantity"]]
    .sum()
    .reset_index()
)

df.to_csv('./data/processed/production_all_category_2000_2020.csv', index=False)