In [1]:
# loading required packages
import os
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# creating a connection to the database
# The SQLAlchemy connection URL embeds the database password, so it is read from
# the environment instead of being stored in the notebook. Set it before
# starting the kernel, e.g.
#   export FAOSTAT_DB_URL='postgresql+psycopg2://<user>:<password>@localhost:5432/faostat_ms_dsci_project'
db_url = os.environ.get('FAOSTAT_DB_URL')
if not db_url:
    raise RuntimeError(
        "Environment variable FAOSTAT_DB_URL is not set; it must hold the "
        "SQLAlchemy connection URL for the faostat_ms_dsci_project database."
    )
engine = create_engine(db_url)

# list of table names to be read
table_names = [
    'producer_prices', 'production_value', 'production_index', 'crop_production',
    'landuse', 'trade', 'trade_indicators', 'trade_indices', 'economic_indicators', 
    'foreign_investment', 'government_investment', 'credit_to_agri', 'emission_indicators', 
    'employment_indicators', 'annual_population', 'agri_trade_indicators', 
    'nitrogen_fertilizer', 'phosphorus_fertilizer', 'potassium_fertilizer', 
    'cropland_nutrient_balance', 'pesticide', 'pesticide_trade', 'temperature_change'
] 

# dictionary to hold tables: maps each table name to a DataFrame with the full
# contents of that table (one read_sql_table call per name, in list order)
dfs = {name: pd.read_sql_table(name, engine) for name in table_names}

In [3]:
# individual tables are looked up in `dfs` by their table name;
# preview the first five rows of the 'trade' table
dfs['trade'].head(n=5)

Unnamed: 0,area_code,area,year_code,year,item_code,item,export_quantity,export_value,import_quantity,import_value
0,1,Armenia,1992,1992,10,Total Merchandise Trade,,12000.0,,200000.0
1,1,Armenia,1993,1993,10,Total Merchandise Trade,,30000.0,,200000.0
2,1,Armenia,1994,1994,10,Total Merchandise Trade,,232495.0,,393841.0
3,1,Armenia,1995,1995,10,Total Merchandise Trade,,270943.0,,673917.0
4,1,Armenia,1996,1996,10,Total Merchandise Trade,,290314.0,,855801.0


In [4]:
# join keys: key_cols_2 is the short key set; key_cols_1 extends it with the
# two item columns for tables that are joined at item level
key_cols_2 = ['area_code', 'area', 'year_code', 'year']
key_cols_1 = key_cols_2 + ['item_code', 'item']

# tables merged onto the base frame, in join order
# ('producer_prices' and 'crop_production' are left out: they form the base below)
table_names = [
    'production_value',
    'production_index',
    'landuse',
    'trade',
    'trade_indicators',
    'trade_indices',
    'economic_indicators',
    'foreign_investment',
    'government_investment',
    'credit_to_agri',
    'emission_indicators',
    'employment_indicators',
    'annual_population',
    'agri_trade_indicators',
    'nitrogen_fertilizer',
    'phosphorus_fertilizer',
    'potassium_fertilizer',
    'cropland_nutrient_balance',
    'pesticide',
    'pesticide_trade',
    'temperature_change',
]

# subset of table_names joined on key_cols_1; every other table uses key_cols_2
table_group_1 = [
    'production_value',
    'production_index',
    'trade',
    'trade_indicators',
    'trade_indices',
]

# base frame: producer prices left-joined with crop production on the long key set
df_merged = dfs['producer_prices'].merge(
    dfs['crop_production'],
    how='left',
    on=key_cols_1,
)

# fold the remaining tables into the base frame one left join at a time
# NOTE(review): a left merge repeats base rows when the right-hand table has
# duplicate key combinations; consider validate='many_to_one' once key
# uniqueness of each table is confirmed -- TODO confirm
for table in table_names:
    join_keys = key_cols_1 if table in table_group_1 else key_cols_2
    df_merged = df_merged.merge(dfs[table], how='left', on=join_keys)

df_merged.head()

Unnamed: 0,area_code,area,year_code,year,item_code,item,producer_price,producer_price_index,area_harvested,laying,...,total_pesticide_use_per_value_of_agri_production,total_pesticide_export_quantity,total_pesticide_export_value,total_pesticide_import_quantity,total_pesticide_import_value,temp_change_dec_jan_feb,temp_change_mar_apr_may,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_meteorological_year
0,1,Armenia,1997,1997,15,Wheat,225.3,89.4,108000.0,,...,0.05,3.604,3.687,117.861,298.379,2.166,-0.319,0.87,0.184,0.725
1,1,Armenia,1998,1998,15,Wheat,205.6,83.91,118300.0,,...,0.06,3.604,3.687,207.171,340.124,-0.259,1.483,2.145,2.027,1.349
2,1,Armenia,1999,1999,15,Wheat,177.5,76.79,110129.0,,...,0.07,3.604,3.687,296.481,381.869,3.535,0.814,1.611,0.264,1.556
3,1,Armenia,2000,2000,15,Wheat,163.5,71.3,106581.0,,...,0.08,3.604,3.687,160.902,482.955,1.148,0.723,2.166,0.364,1.1
4,1,Armenia,2001,2001,15,Wheat,166.0,74.51,108554.0,,...,0.09,1.145,9.068,200.884,698.022,1.818,2.072,1.578,0.409,1.469


In [5]:
# (number of rows, number of columns) of the integrated dataframe
df_merged.shape

(392856, 117)

In [6]:
# exporting integrated dataset as csv file
# The destination is configurable through the FAOSTAT_OUTPUT_CSV environment
# variable and defaults to 'integrated_data.csv' in the current working
# directory, so the notebook does not depend on one user's home directory.
output_path = Path(os.environ.get('FAOSTAT_OUTPUT_CSV', 'integrated_data.csv'))
# make sure the target directory exists before writing
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written as a column
df_merged.to_csv(output_path, index=False)