In [117]:
import pandas as pd
from sqlalchemy import create_engine

### Extract CSVs into DataFrames

In [118]:
#load file into panda. make sure to add thousands=',' in read_csv as some of the figures contain commas
cocoa_daily = "../Resources/Cocoa Prices - Daily Prices 1970-2020.csv"
cocoa_monthly = "../Resources/Cocoa Prices - Monthly Average 1970-2020.csv"
cocoa_prod_indices="../Resources/production_indices.csv"
cocoa_daily_df = pd.read_csv(cocoa_daily, thousands=',')
cocoa_monthly_df = pd.read_csv(cocoa_monthly, thousands=',')
cocoa_prod_indices_df=pd.read_csv(cocoa_prod_indices, thousands=',')

In [119]:
print(cocoa_daily_df.dtypes)
print('---------------------')
print(cocoa_monthly_df.dtypes)
print('---------------------')
print(cocoa_prod_indices_df.dtypes)

Date                                  object
London futures (£ sterling/tonne)    float64
New York futures (US$/tonne)         float64
ICCO daily price (US$/tonne)         float64
ICCO daily price (Euro/tonne)        float64
dtype: object
---------------------
Month          object
Euro/tonne    float64
US$/tonne     float64
dtype: object
---------------------
Domain Code          object
Domain               object
Area Code             int64
Area                 object
Element Code          int64
Element              object
Item Code             int64
Item                 object
Year Code             int64
Year                  int64
Unit                 object
Value               float64
Flag                 object
Flag Description     object
dtype: object


### Transform premise DataFrame

In [120]:
cocoa_daily_df.head()

Unnamed: 0,Date,London futures (£ sterling/tonne),New York futures (US$/tonne),ICCO daily price (US$/tonne),ICCO daily price (Euro/tonne)
0,31/12/2020,1684.67,2544.33,2424.35,1982.39
1,30/12/2020,1683.67,2534.0,2412.99,1965.02
2,29/12/2020,1634.0,2459.33,2334.66,1905.67
3,28/12/2020,1640.0,2502.0,2355.17,1928.06
4,24/12/2020,1646.0,2537.33,2385.85,1958.07


In [121]:
cocoa_monthly_df.head()

Unnamed: 0,Month,Euro/tonne,US$/tonne
0,01/12/2020,1978.37,2407.2
1,01/11/2020,1992.32,2358.18
2,01/10/2020,1947.42,2292.06
3,01/09/2020,1985.17,2457.9
4,01/08/2020,1985.17,2348.68


In [122]:
cocoa_prod_indices_df.head()

Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QI,Production Indices,7,Angola,432,Gross Production Index Number (2014-2016 = 100),661,"Cocoa, beans",1961,1961,index,92.74,Fc,Calculated data
1,QI,Production Indices,7,Angola,432,Gross Production Index Number (2014-2016 = 100),661,"Cocoa, beans",1962,1962,index,92.74,Fc,Calculated data
2,QI,Production Indices,7,Angola,432,Gross Production Index Number (2014-2016 = 100),661,"Cocoa, beans",1963,1963,index,69.55,Fc,Calculated data
3,QI,Production Indices,7,Angola,432,Gross Production Index Number (2014-2016 = 100),661,"Cocoa, beans",1964,1964,index,69.55,Fc,Calculated data
4,QI,Production Indices,7,Angola,432,Gross Production Index Number (2014-2016 = 100),661,"Cocoa, beans",1965,1965,index,92.74,Fc,Calculated data


<strong>Cocoa Daily Price</strong>
- clean up cocoa_daily_df

In [123]:
# filter out cocoa_daily_df to only include prices in USD/tonnes
cocoa_daily_df=cocoa_daily_df[['Date','New York futures (US$/tonne)', 'ICCO daily price (US$/tonne)']]

In [124]:
#add 'year' and 'month/year' column in cocoa_daily_df 
cocoa_daily_df['year code']=cocoa_daily_df['Date'].str[-4:]
cocoa_daily_df['month/year code']=cocoa_daily_df['Date'].str[-7:]

cocoa_daily_df

# # the above will allow analysts to create another table to summarise cocoa prices on yearly & monthly basis (i.e average price, max, and min for each year)
# sample code
# cocoa_price_yearly_summary=round(cocoa_daily_df.groupby('year').agg({'New York futures (US$/tonne)' : ['mean', 'min', 'max'], 'ICCO daily price (US$/tonne)' : ['mean', 'min', 'max']}),2)
# cocoa_price_monthly_summary=round(cocoa_daily_df.groupby('month/year').agg({'New York futures (US$/tonne)' : ['mean', 'min', 'max'], 'ICCO daily price (US$/tonne)' : ['mean', 'min', 'max']}),2)


Unnamed: 0,Date,New York futures (US$/tonne),ICCO daily price (US$/tonne),year code,month/year code
0,31/12/2020,2544.33,2424.35,2020,12/2020
1,30/12/2020,2534.00,2412.99,2020,12/2020
2,29/12/2020,2459.33,2334.66,2020,12/2020
3,28/12/2020,2502.00,2355.17,2020,12/2020
4,24/12/2020,2537.33,2385.85,2020,12/2020
...,...,...,...,...,...
6736,07/10/1994,1307.67,1416.46,1994,10/1994
6737,06/10/1994,1322.33,1430.32,1994,10/1994
6738,05/10/1994,1326.00,1430.57,1994,10/1994
6739,04/10/1994,1358.33,1462.28,1994,10/1994


In [160]:
#create a table to show yearly average price, min price, & max price recorded in New York Futures
cocoa_NYF=cocoa_daily_df.copy()
del cocoa_NYF['ICCO daily price (US$/tonne)']
cocoa_NYF_yearly= round(cocoa_NYF.groupby('year code').agg(['mean', 'min', 'max']),2)
cocoa_NYF_yearly.columns = cocoa_NYF_yearly.columns.droplevel()

cocoa_NYF_yearly.head()

Unnamed: 0_level_0,mean,min,max
year code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1994,1324.02,1237.67,1391.33
1995,1350.05,1252.0,1474.67
1996,1370.41,1242.67,1466.67
1997,1544.44,1252.67,1783.33
1998,1601.63,1400.67,1762.33


In [161]:
#create a table to show yearly average price, min price, & max price recorded in ICCO
cocoa_ICCO=cocoa_daily_df.copy()
del cocoa_ICCO['New York futures (US$/tonne)']
cocoa_ICCO_yearly= round(cocoa_ICCO.groupby('year code').agg(['mean', 'min', 'max']),2)
cocoa_ICCO_yearly.columns = cocoa_ICCO_yearly.columns.droplevel()

cocoa_ICCO_yearly.head()

Unnamed: 0_level_0,mean,min,max
year code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1994,1428.25,1346.41,1497.14
1995,1433.32,1335.09,1568.62
1996,1455.81,1313.15,1572.89
1997,1620.86,1322.59,1834.68
1998,1675.51,1462.83,1830.04


In [163]:
#create a table to show monthly average price, min price, & max price recorded in New York Futures
cocoa_NYF_monthly= round(cocoa_NYF.groupby('month/year code').agg(['mean', 'min', 'max']),2)
cocoa_NYF_monthly.columns = cocoa_NYF_monthly.columns.droplevel()

cocoa_NYF_monthly.head()

Unnamed: 0_level_0,mean,min,max
month/year code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/1995,1368.54,1305.33,1408.67
01/1996,1287.65,1265.33,1310.33
01/1997,1361.29,1293.0,1415.67
01/1998,1612.79,1570.33,1667.0
01/1999,1383.05,1337.0,1436.67


In [164]:
#create a table to show monthly average price, min price, & max price recorded in ICCO
cocoa_ICCO_monthly= round(cocoa_ICCO.groupby('month/year code').agg(['mean', 'min', 'max']),2)
cocoa_ICCO_monthly.columns = cocoa_ICCO_monthly.columns.droplevel()

cocoa_ICCO_yearly.head()

Unnamed: 0_level_0,mean,min,max
year code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1994,1428.25,1346.41,1497.14
1995,1433.32,1335.09,1568.62
1996,1455.81,1313.15,1572.89
1997,1620.86,1322.59,1834.68
1998,1675.51,1462.83,1830.04


<strong> Production Indices </strong>
- based on http://www.fao.org/waicent/faostat/agricult/indices-e.htm, The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 1999-2001. They are based on the sum of price-weighted quantities of different agricultural commodities produced after deductions of quantities used as seed and feed weighted in a similar manner. The resulting aggregate represents, therefore, disposable production for any use except as seed and feed.
- clean up Production Indices table

In [125]:
prod_indices_df= cocoa_prod_indices_df[['Domain', 'Area Code', 'Area', 'Element', 'Item', 'Year', 'Unit', 'Value']]
prod_indices_df.head()

Unnamed: 0,Domain,Area Code,Area,Element,Item,Year,Unit,Value
0,Production Indices,7,Angola,Gross Production Index Number (2014-2016 = 100),"Cocoa, beans",1961,index,92.74
1,Production Indices,7,Angola,Gross Production Index Number (2014-2016 = 100),"Cocoa, beans",1962,index,92.74
2,Production Indices,7,Angola,Gross Production Index Number (2014-2016 = 100),"Cocoa, beans",1963,index,69.55
3,Production Indices,7,Angola,Gross Production Index Number (2014-2016 = 100),"Cocoa, beans",1964,index,69.55
4,Production Indices,7,Angola,Gross Production Index Number (2014-2016 = 100),"Cocoa, beans",1965,index,92.74


### Create database connection

In [126]:
connection_string = "postgres:postgres@localhost:5432/ETL_project"
engine = create_engine(f'postgresql://{connection_string}')

In [127]:
# Confirm tables
engine.table_names()

OperationalError: (psycopg2.OperationalError) FATAL:  password authentication failed for user "postgres"

(Background on this error at: http://sqlalche.me/e/e3q8)

### Load DataFrames into database

In [None]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)