# Cocoa Investigation

In [26]:
# import dependencies
import pandas as pd
import numpy as np


In [27]:
# filepath to data
filename = '../Resources/crop_livestock_production.csv'
# read in table
area_data = pd.read_csv(filename)

In [28]:
# explore data
area_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14020 entries, 0 to 14019
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Domain Code       14020 non-null  object 
 1   Domain            14020 non-null  object 
 2   Area Code         14020 non-null  int64  
 3   Area              14020 non-null  object 
 4   Element Code      14020 non-null  int64  
 5   Element           14020 non-null  object 
 6   Item Code         14020 non-null  int64  
 7   Item              14020 non-null  object 
 8   Year Code         14020 non-null  int64  
 9   Year              14020 non-null  int64  
 10  Unit              14020 non-null  object 
 11  Value             13711 non-null  float64
 12  Flag              10539 non-null  object 
 13  Flag Description  14020 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 1.5+ MB


In [29]:
area_data.head()

Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QCL,Crops and livestock products,7,Angola,5312,Area harvested,661,"Cocoa, beans",1961,1961,ha,1600.0,F,FAO estimate
1,QCL,Crops and livestock products,7,Angola,5312,Area harvested,661,"Cocoa, beans",1962,1962,ha,1600.0,F,FAO estimate
2,QCL,Crops and livestock products,7,Angola,5312,Area harvested,661,"Cocoa, beans",1963,1963,ha,1400.0,F,FAO estimate
3,QCL,Crops and livestock products,7,Angola,5312,Area harvested,661,"Cocoa, beans",1964,1964,ha,1400.0,F,FAO estimate
4,QCL,Crops and livestock products,7,Angola,5312,Area harvested,661,"Cocoa, beans",1965,1965,ha,1400.0,F,FAO estimate


In [30]:
columns = area_data.columns
columns

Index(['Domain Code', 'Domain', 'Area Code', 'Area', 'Element Code', 'Element',
       'Item Code', 'Item', 'Year Code', 'Year', 'Unit', 'Value', 'Flag',
       'Flag Description'],
      dtype='object')

Columns to be dropped - Domain Code, Domain, Item Code, Item, Year Code, Flag, Flag Description, Area Code

Info on other columns:
Element Codes
5312 Area harvested (ha)
5419 Yield (hg/ha)
5510 Production (tonnes)

In [31]:
production = area_data.drop(['Domain Code', 'Domain', 'Area Code', 'Element Code',
       'Item Code', 'Item', 'Year Code', 'Flag',
       'Flag Description'], axis = 1)

In [32]:
production.head()

Unnamed: 0,Area,Element,Year,Unit,Value
0,Angola,Area harvested,1961,ha,1600.0
1,Angola,Area harvested,1962,ha,1600.0
2,Angola,Area harvested,1963,ha,1400.0
3,Angola,Area harvested,1964,ha,1400.0
4,Angola,Area harvested,1965,ha,1400.0


In [33]:
# Split production table into 3 tables, one for each Element

# first table
area_harvested = production.loc[production['Element'] == "Area harvested"]

In [34]:
# drop unnecessary column
area_harvested = area_harvested.drop('Element', axis = 1)

In [35]:
# re-label columns
area_harvested.columns = ['Country', 'Year', 'Area_unit', 'Area_harvested']

In [36]:
# repeat for other columns
prod = production.loc[production['Element'] == "Production"]
prod = prod.drop('Element', axis = 1)
prod.columns = ['Country', 'Year', 'Production_unit', 'Production']

p_yield = production.loc[production['Element'] == "Yield"]
p_yield = p_yield.drop('Element', axis = 1)
p_yield.columns = ['Country', 'Year', 'Yield_unit', 'Yield']

In [37]:
# combine tables
df1 = pd.merge(area_harvested, prod, how = "outer", on=["Country", "Year"])
country_production = pd.merge(df1, p_yield, how = "outer", on=["Country", "Year"])
country_production


Unnamed: 0,Country,Year,Area_unit,Area_harvested,Production_unit,Production,Yield_unit,Yield
0,Angola,1961,ha,1600.0,tonnes,400.0,hg/ha,2500.0
1,Angola,1962,ha,1600.0,tonnes,400.0,hg/ha,2500.0
2,Angola,1963,ha,1400.0,tonnes,300.0,hg/ha,2143.0
3,Angola,1964,ha,1400.0,tonnes,300.0,hg/ha,2143.0
4,Angola,1965,ha,1400.0,tonnes,400.0,hg/ha,2857.0
...,...,...,...,...,...,...,...,...
4750,Benin,1987,,,tonnes,70.0,,
4751,Benin,1988,,,tonnes,,,
4752,Benin,1989,,,tonnes,,,
4753,Benin,1990,,,tonnes,15.0,,


In [38]:
# export dataframe so it can be imported into sql
country_production.to_csv("country_production.csv")

In [39]:
# the next step is to merge this table with the country_region table
# so that data can be grouped by region

# import country_region table
filename = "country_region.csv"
country_region = pd.read_csv(filename)

In [40]:
country_region

Unnamed: 0,Country,Region
0,Andorra,Europe
1,United Arab Emirates,Middle east
2,Afghanistan,Asia & Pacific
3,Antigua and Barbuda,South/Latin America
4,Anguilla,South/Latin America
...,...,...
243,Guernsey,Europe
244,Isle of Man,Europe
245,Jersey,Europe
246,Saint Barthelemy,South/Latin America


In [41]:
# merge the two tables to get the region
production_country_region = pd.merge(country_production, country_region, how = "inner", left_on = "Country", right_on = "Country")

In [42]:
# drop the Country column
production_country_region = production_country_region.drop("Country", axis = 1)

In [43]:
production_country_region.columns

Index(['Year', 'Area_unit', 'Area_harvested', 'Production_unit', 'Production',
       'Yield_unit', 'Yield', 'Region'],
      dtype='object')

In [44]:
# some columns need to be summed and others averaged
calc = {'Area_unit':'first', 'Area_harvested':sum, 'Production_unit':'first',
       'Production':sum, 'Yield_unit':'first', 'Yield':'mean' }
production_region = production_country_region.groupby(["Region", "Year"]).agg(calc)

In [45]:
# reset index so table is easy to export and load
production_region = production_region.reset_index()


In [46]:
# export table
production_region.to_csv('region_production.csv', index=False)

In [47]:
from sqlalchemy import create_engine
from config import postgresql_pword

In [48]:
# create engine
engine = create_engine(f'postgresql://postgres:{postgresql_pword}@localhost:5432/ETL_project')
table_names = engine.table_names()
print(table_names)

['country_region', 'country_production']


  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
country_production.columns

Index(['Country', 'Year', 'Area_unit', 'Area_harvested', 'Production_unit',
       'Production', 'Yield_unit', 'Yield'],
      dtype='object')

In [51]:
country_production.to_sql(name="country_production", con=engine, if_exists='append', index=False)

In [56]:
li1=country_production["Country"]
li2=country_region["Country"]
def Diff(li1, li2):
    return list(set(li1) - set(li2)) + list(set(li2) - set(li1))

li3 = Diff(li1, li2)
print(li3)

['Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Bolivia (Plurinational State of)', 'Americas', 'Asia', 'Africa', 'Land Locked Developing Countries', 'Caribbean', 'China, Taiwan Province of', 'Western Africa', 'Small Island Developing States', 'United Republic of Tanzania', 'Middle Africa', 'Least Developed Countries', 'Eastern Africa', 'Central America', 'Melanesia', "Côte d'Ivoire", 'Democratic Republic of the Congo', 'South-eastern Asia', 'Polynesia', 'Venezuela (Bolivarian Republic of)', 'Oceania', 'Micronesia', 'South America', 'Southern Asia', 'Eastern Asia', 'Micronesia (Federated States of)', 'World', 'Senegal', 'Marshall Islands', 'Netherlands', 'Lesotho', 'Virgin Islands, British', 'Ukraine', 'Heard Island and McDonald Islands', 'Italy', 'Egypt', 'Iraq', 'Singapore', 'Andorra', 'Poland', 'Georgia', 'Gambia', 'Norfolk Island', 'Burundi', 'Bahamas', 'France, Metropolitan', 'Mali', 'Turkmenistan', 'Gibraltar', 'Aruba', 'French Guiana', 'Hungary',

In [57]:
len(li3)

221

In [None]:
new_trade_df.replace(to_replace={"Belgium-Luxembourg": "Belgium"}, inplace=True)
new_trade_df.replace(to_replace={"Venezuela (Bolivarian Republic of)": "Venezuela"}, inplace=True)
new_trade_df.replace(to_replace={"Iran (Islamic Republic of)": "Iran"}, inplace=True)
new_trade_df.replace(to_replace={"Bolivia (Plurinational State of)": "Bolivia"}, inplace=True)
new_trade_df.replace(to_replace={"Sudan (former)": "Sudan"}, inplace=True)
new_trade_df.replace(to_replace={"Democratic Republic of the Congo": "Congo, The Democratic Republic of the"}, inplace=True)
