In [297]:
import pandas as pd
import json

pd.set_option('display.max_columns', None)
all_df = pd.read_csv('./data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data_NOFLAG.csv', encoding="ISO-8859-1", keep_default_na=False)
lat_lon_code = pd.read_csv('./data/world_country_and_usa_states_latitude_and_longitude_values.csv', keep_default_na=False)

lat_lon_code = pd.read_csv('./data/world_country_and_usa_states_latitude_and_longitude_values.csv')
lat_lon_code = lat_lon_code.sort_values(by=['Country']).drop_duplicates(subset=['Alpha-2 code'])
lat_lon_code.loc[:, "Numeric code"] = lat_lon_code["Numeric code"].str.strip(' "').astype(int)
lat_lon_code.loc[:, "Alpha-2 code"] = lat_lon_code["Alpha-2 code"].str.strip(' "')
lat_lon_code.loc[:, "Alpha-3 code"] = lat_lon_code["Alpha-3 code"].str.strip(' "')
lat_lon_code.loc[:, "Latitude (average)"] = lat_lon_code["Latitude (average)"].str.strip(' "').astype(float)
lat_lon_code.loc[:, "Longitude (average)"] = lat_lon_code["Longitude (average)"].str.strip(' "').astype(float)
area_code_to_iso_dict = lat_lon_code[["Numeric code", "Alpha-2 code"]].set_index("Numeric code")['Alpha-2 code'].to_dict()

all_df.loc[:, 'Area Code (M49)'] = all_df['Area Code (M49)'].str.strip(" '").astype(int)
all_df.loc[:, 'ISO-A2'] = all_df['Area Code (M49)'].replace(area_code_to_iso_dict)
all_df = all_df[all_df['ISO-A2'].isin(lat_lon_code['Alpha-2 code'])]

def make_year_cols(start, end):
    return [f'Y{y}' for y in range(start, end)]

In [298]:
all_df = all_df.merge(
    lat_lon_code[['Latitude (average)', 'Longitude (average)', 'Country', 'Alpha-2 code', 'Alpha-3 code']],
    how='left',
    left_on='ISO-A2',
    right_on='Alpha-2 code',
)

all_df = all_df.melt(
    id_vars=["Country", "ISO-A2", "Alpha-3 code", "Longitude (average)", "Latitude (average)", 'Item', 'Item Code', 'Element', 'Unit'], 
    value_vars=make_year_cols(2000, 2021), 
    value_name='Quantity', 
    var_name='Year'
)

all_df['Year'] = all_df['Year'].str.strip('Y').astype(int)
all_df = all_df.rename(columns={"Country": "Area"})
all_df.loc[:, 'Area'] = all_df['Area'].apply(lambda x: x if not x.endswith('of') and not x.endswith('of the') else ' '.join(x.split(', ')[::-1]))

In [299]:
population = pd.read_csv('./data/population_by_country.csv')
population = population.melt(
    id_vars=['Country Name', 'Country Code'],
    value_vars=[str(y) for y in range(2000, 2021)],
    value_name='Population',
    var_name='Year',
)

population['Year'] = population['Year'].astype(int)
all_df = pd.merge(
    all_df, population,
    how='left',
    left_on=['Alpha-3 code', 'Year'],
    right_on=['Country Code', 'Year']
)

In [305]:
continent = pd.read_csv('./data/UNSD — Methodology.csv', sep=';', encoding='utf-8', keep_default_na=False)
all_df = all_df.merge(
    continent[['Region Name', 'ISO-alpha2 Code']],
    how='left',
    left_on='ISO-A2',
    right_on='ISO-alpha2 Code'
).rename(columns={'Region Name': 'Continent'}).drop(columns=['ISO-alpha2 Code'])

In [306]:
all_df

Unnamed: 0,Area,ISO-A2,Alpha-3 code,Longitude (average),Latitude (average),Item,Item Code,Element,Unit,Year,Quantity,Country Name,Country Code,Population,Continent
0,Afghanistan,AF,AFG,65.0,33.0,"Almonds, in shell",221,Area harvested,ha,2000,7000.000000,Afghanistan,AFG,20779957.0,Asia
1,Afghanistan,AF,AFG,65.0,33.0,"Almonds, in shell",221,Yield,hg/ha,2000,17143.000000,Afghanistan,AFG,20779957.0,Asia
2,Afghanistan,AF,AFG,65.0,33.0,"Almonds, in shell",221,Production,tonnes,2000,12000.000000,Afghanistan,AFG,20779957.0,Asia
3,Afghanistan,AF,AFG,65.0,33.0,"Anise, badian, coriander, cumin, caraway, fenn...",711,Area harvested,ha,2000,4000.000000,Afghanistan,AFG,20779957.0,Asia
4,Afghanistan,AF,AFG,65.0,33.0,"Anise, badian, coriander, cumin, caraway, fenn...",711,Yield,hg/ha,2000,6250.000000,Afghanistan,AFG,20779957.0,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190611,Zimbabwe,ZW,ZWE,30.0,-20.0,"Treenuts, Total",1729,Yield,hg/ha,2020,20832.000000,Zimbabwe,ZWE,14862927.0,Africa
1190612,Zimbabwe,ZW,ZWE,30.0,-20.0,"Treenuts, Total",1729,Production,tonnes,2020,7064.000000,Zimbabwe,ZWE,14862927.0,Africa
1190613,Zimbabwe,ZW,ZWE,30.0,-20.0,Vegetables Primary,1735,Area harvested,ha,2020,33744.000000,Zimbabwe,ZWE,14862927.0,Africa
1190614,Zimbabwe,ZW,ZWE,30.0,-20.0,Vegetables Primary,1735,Yield,hg/ha,2020,67711.000000,Zimbabwe,ZWE,14862927.0,Africa


# check country name

In [24]:
with open('./data/ne_110m.json', 'r') as f:
    text = f.read()

In [None]:
for country_name in all_df['Area'].unique():
    if text.find('"NAME":"{}"'.format(country_name)) == -1:
        print(country_name)


In [307]:
all_df.isna().sum()

Area                       0
ISO-A2                     0
Alpha-3 code               0
Longitude (average)        0
Latitude (average)         0
Item                       0
Item Code                  0
Element                    0
Unit                       0
Year                       0
Quantity                   0
Country Name           32613
Country Code           32613
Population             34179
Continent                  0
dtype: int64

# wrangle file

In [308]:
select_columns = [
    'Area',
    'ISO-A2',
    'Item',
    "Quantity",
    "Unit",
    'Latitude (average)',
    'Longitude (average)',
    'Continent',
    'Year',
    'Population'
]

In [309]:
df = all_df[
    (all_df['Item'].str.endswith('Total')) &\
    (all_df['Element'].isin(['Production']))
].dropna(
  axis=0,
  how='any',
  subset=["ISO-A2", "Continent", "Quantity"]
)[select_columns]

df.to_csv('data/processed/production_major_categories_per_country_2000-2020.csv', index=False)

In [310]:
df = all_df[
    (all_df['Item'].str.endswith('Total')) &\
    (all_df['Element'].isin(['Yield']))
].dropna(
  axis=0,
  how='any',
  subset=["ISO-A2", "Continent", "Quantity"]
)[select_columns]

df.to_csv('data/processed/yield_major_categories_per_country_2000-2020.csv', index=False)

In [311]:
df = all_df[
    (~all_df['Item'].str.endswith('Total')) &\
    (all_df["Element"] == "Production")
].dropna(
  axis=0,
  how='any',
).groupby(["Item", "Year", "Unit"])[["Quantity"]]\
 .sum()\
 .reset_index()

df.to_csv('./data/processed/production_all_category_2000_2020.csv', index=False)

In [None]:
1) CROPS PRIMARY:
['Cereals',' Citrus Fruit',' Fibre Crops', 'Fruit', 'Oil Crops', 'Oil Crops and Cakes in Oil Equivalent',' Pulses',' Roots and Tubers',' Sugar Crops',' Treenuts and Vegetables']
2) CROPS PROCESSED:
Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.
3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.
4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.
5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt

In [57]:
import json
with open('./data/ne_110m_admin_0_countries.topojson', 'r', encoding="ISO-8859-1") as f:
    x = json.load(f)

In [58]:
f = [geom['properties']['NAME'] for geom in x['objects']['ne_110m_admin_0_countries']['geometries']]

In [59]:
set(f).difference(set(all_df["Area"]))

{'Antarctica',
 "CÃ´te d'Ivoire",
 'Falkland Is.',
 'Fr. S. Antarctic Lands',
 'Greenland',
 'Kosovo',
 'Macedonia',
 'N. Cyprus',
 'S. Sudan',
 'Somaliland',
 'W. Sahara',
 'eSwatini'}

In [45]:
set(f)

ValueError: I/O operation on closed file.

In [69]:
all_df[all_df['Item'].str.endswith('Primary')]["Item"].unique()

array(['Eggs Primary', 'Fruit Primary', 'Sugar Crops Primary',
       'Vegetables Primary'], dtype=object)