In [46]:
import pandas as pd

# Do not truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw inputs. Only the main production table is read as ISO-8859-1;
# the remaining files are read with pandas' default encoding.
all_df = pd.read_csv(
    './data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data_NOFLAG.csv',
    encoding="ISO-8859-1",
)
area_code_df = pd.read_csv(
    './data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_AreaCodes.csv'
)
lat_lon = pd.read_csv('./data/world_country_and_usa_states_latitude_and_longitude_values.csv')
population = pd.read_csv('./data/population_by_country.csv')

# Overwrite 'Area' with the name the area-code file lists for each 'Area Code'.
# `replace` leaves any code that is missing from the lookup unchanged.
code_to_name_dict = area_code_df.set_index('Area Code')['Area'].to_dict()
all_df.loc[:, 'Area'] = all_df['Area Code'].replace(code_to_name_dict)

def make_year_cols(start, end):
    """Return the year column labels 'Y<start>', ..., 'Y<end - 1>'.

    Args:
        start: First year to include.
        end: Year to stop before (exclusive, as with ``range``).

    Returns:
        A list of strings, one per year, each prefixed with 'Y'.
    """
    return list(map('Y{}'.format, range(start, end)))

In [47]:
# Attach coordinates by area name, reshape the wide Y2000..Y2020 columns into
# one (Year, Quantity) row per observation, then turn 'Y2000' into the int 2000.
# NOTE: this cell rebinds `all_df`, so it cannot be re-run without re-running
# the load cell first (the Y* columns are gone after the melt).
all_df = (
    all_df
    .merge(
        lat_lon[['latitude', 'longitude', 'country']],
        how='left',
        left_on='Area',
        right_on='country',
    )
    .melt(
        id_vars=['Area', 'Area Code', "longitude", "latitude", 'Item', 'Item Code', 'Element', 'Unit'],
        value_vars=make_year_cols(2000, 2021),
        var_name='Year',
        value_name='Quantity',
    )
    .assign(Year=lambda frame: frame['Year'].str.strip('Y').astype(int))
)

In [48]:
# Reshape population from one column per year ('2000'..'2020') to one row per
# (country, year), with Year stored as an int so it can be joined on later.
population_year_cols = list(map(str, range(2000, 2021)))

population = (
    population
    .melt(
        id_vars=['Country Name'],
        value_vars=population_year_cols,
        var_name='Year',
        value_name='Population',
    )
    .assign(Year=lambda frame: frame['Year'].astype(int))
)

In [49]:
# Left-join population onto every (area, year) row; rows whose area name has no
# match in the population table keep a missing Population value.
all_df = all_df.merge(
    population,
    how='left',
    left_on=['Area', 'Year'],
    right_on=['Country Name', 'Year'],
)

# check country name

In [24]:
# Read the map file as raw text; the following cell searches it for country names.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII country names decode the same way on every platform instead of
# following the locale default.
with open('./data/ne_110m.json', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Print every area name that does not appear as a "NAME" property in the map
# file text. This is an exact substring match, so it is sensitive to spacing
# and spelling differences between the two sources.
for country_name in all_df['Area'].unique():
    needle = f'"NAME":"{country_name}"'
    if needle not in text:
        print(country_name)


# wrangle file

In [38]:
# Production rows for the aggregated "... Total" items.
# NOTE(review): `Area Code <= 300` presumably keeps individual countries and
# drops aggregate regions -- confirm against the area-code file.
is_total_item = all_df['Item'].str.endswith('Total')
is_low_area_code = all_df['Area Code'] <= 300
is_production = all_df['Element'].isin(['Production'])

# Keep only rows that have coordinates and a population value.
df = all_df[is_total_item & is_low_area_code & is_production].dropna(
    axis=0,
    how='any',
    subset=['latitude', 'longitude', 'Population'],
)

df.to_csv('data/processed/production_major_categories_per_country_2000-2020.csv')

In [None]:
# Same selection as the production export, but for the 'Yield' element:
# "... Total" items, Area Code <= 300, and complete coordinates/population.
yield_totals_mask = (
    all_df['Item'].str.endswith('Total')
    & (all_df['Area Code'] <= 300)
    & all_df['Element'].isin(['Yield'])
)

df = all_df[yield_totals_mask].dropna(
    axis=0,
    how='any',
    subset=['latitude', 'longitude', 'Population'],
)

df.to_csv('data/processed/yield_major_categories_per_country_2000-2020.csv')

In [44]:
# Total produced quantity per (Item, Year, Unit) across all areas, for every
# item that is NOT one of the aggregated "... Total" rows.
# NOTE(review): unlike the two exports above, this cell has no
# `Area Code <= 300` filter; only the dropna on coordinates/population limits
# which areas are summed -- confirm that this is intended.
non_total_production = (
    ~all_df['Item'].str.endswith('Total')
) & (all_df["Element"] == "Production")

df = (
    all_df[non_total_production]
    .dropna(
        axis=0,
        how='any',
        subset=['latitude', 'longitude', 'Population'],
    )
    .groupby(["Item", "Year", "Unit"])[["Quantity"]]
    .sum()
    .reset_index()
)

df.to_csv('./data/processed/production_all_category_2000_2020.csv')

In [61]:
# First character of the stringified item code, stored as a new column.
# `f` is kept as a name because the original cell leaves it in the namespace.
all_df['first code digit'] = f = all_df['Item Code'].astype(str).str[:1]


Unnamed: 0,first code digit,Item
49381,1,Butter of goat milk
202,1,"Citrus Fruit, Total"
201,1,Cheese (All Kinds)
198,1,"Cereals, Total"
197,1,Cattle and Buffaloes
...,...,...
26,9,"Cheese from milk of sheep, fresh or processed"
169,9,"Sheep fat, unrendered"
16,9,Butter and ghee of sheep milk
5859,9,Quinoa


In [64]:
# Distinct item names whose item code starts with '1'.
all_df.loc[all_df['first code digit'] == '1', 'Item'].unique()

array(['Asses', 'Camels', 'Cheese from milk of goats, fresh or processed',
       'Chickens', 'Edible offal of goat, fresh, chilled or frozen',
       'Edible offals of camels and other camelids, fresh, chilled or frozen',
       'Fat of camels', 'Game meat, fresh, chilled or frozen',
       'Goat fat, unrendered', 'Goats', 'Hen eggs in shell, fresh',
       'Horses', 'Meat of camels, fresh or chilled',
       'Meat of chickens, fresh or chilled',
       'Meat of goat, fresh or chilled', 'Molasses', 'Mules and hinnies',
       'Natural honey', 'Potatoes',
       'Raw cane or beet sugar (centrifugal only)',
       'Raw hides and skins of goats or kids', 'Raw milk of camel',
       'Raw milk of goats', 'Raw silk (not thrown)',
       'Silk-worm cocoons suitable for reeling', 'Sugar beet',
       'Sugar cane', 'Wheat', 'Beef and Buffalo Meat', 'Butter and Ghee',
       'Cattle and Buffaloes', 'Cereals, Total', 'Cheese (All Kinds)',
       'Citrus Fruit, Total', 'Eggs Primary', 'Fruit Prim

In [None]:
1) CROPS PRIMARY:
['Cereals', 'Citrus Fruit', 'Fibre Crops', 'Fruit', 'Oil Crops', 'Oil Crops and Cakes in Oil Equivalent', 'Pulses', 'Roots and Tubers', 'Sugar Crops', 'Treenuts', 'Vegetables']
2) CROPS PROCESSED:
Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.
3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.
4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.
5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt

In [69]:
# Distinct item names that end with 'Primary'.
all_df.loc[all_df['Item'].str.endswith('Primary'), 'Item'].unique()

array(['Eggs Primary', 'Fruit Primary', 'Sugar Crops Primary',
       'Vegetables Primary'], dtype=object)