In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import sem
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy import Column



# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# File to Load (Remember to Change These)
freedom_index_csv = "C:\\Users\\JMadd\\git\\ETL_Project\\freedom.csv"

freedom_df = pd.read_csv(freedom_index_csv)


In [2]:
freedom_df.head()

Unnamed: 0,year,ISO_code,countries,region,pf_rol_procedural,pf_rol_civil,pf_rol_criminal,pf_rol,pf_ss_homicide,pf_ss_disappearances_disap,...,ef_regulation_business_bribes,ef_regulation_business_licensing,ef_regulation_business_compliance,ef_regulation_business,ef_regulation,ef_score,ef_rank,hf_score,hf_rank,hf_quartile
0,2016,ALB,Albania,Eastern Europe,6.661503,4.547244,4.666508,5.291752,8.920429,10.0,...,4.050196,7.324582,7.074366,6.705863,6.906901,7.54,34.0,7.56814,48.0,2.0
1,2016,DZA,Algeria,Middle East & North Africa,,,,3.819566,9.456254,10.0,...,3.765515,8.523503,7.029528,5.676956,5.268992,4.99,159.0,5.135886,155.0,4.0
2,2016,AGO,Angola,Sub-Saharan Africa,,,,3.451814,8.06026,5.0,...,1.94554,8.096776,6.782923,4.930271,5.5185,5.17,155.0,5.640662,142.0,4.0
3,2016,ARG,Argentina,Latin America & the Caribbean,7.098483,5.79196,4.34393,5.744791,7.622974,10.0,...,3.260044,5.253411,6.508295,5.535831,5.369019,4.84,160.0,6.469848,107.0,3.0
4,2016,ARM,Armenia,Caucasus & Central Asia,,,,5.003205,8.80875,10.0,...,4.575152,9.319612,6.491481,6.79753,7.378069,7.57,29.0,7.241402,57.0,2.0


In [3]:
#select only info that I intend to use
freedom_df = freedom_df[['year', 'countries', 'pf_rol_civil', 'pf_rol_criminal', 'pf_ss_homicide', 'pf_ss_disappearances_disap', 'pf_movement_women', 'pf_movement', 'pf_religion', 'pf_association_political', 'pf_expression_killed', 'pf_expression_jailed', 'pf_expression_influence', 'pf_expression', 'pf_identity_sex', 'pf_identity_divorce', 'ef_legal_protection', 'ef_legal_military', 'ef_trade_movement', 'ef_regulation_credit_ownership']].copy()


In [4]:
#change names of columns so that they are readable
freedom_df = freedom_df.rename(columns={"pf_rol_civil": "Civil justice", "pf_rol_criminal": "Criminal justice", 'pf_ss_homicide': 'Homicide', 'pf_ss_disappearances_disap': 'Disappearances', 'pf_movement_women': 'Womens movement', 'pf_movement': 'Freedom of movement', 'pf_religion': 'Religious freedom', 'pf_association_political': 'Freedom to establish and operate political parties', 'pf_expression_killed': 'Press killed', 'pf_expression_jailed': 'Press jailed', 'pf_expression_influence': 'regulations influence media', 'pf_expression': 'Freedom of expression', 'pf_identity_sex': 'Same-sex ralitionships', 'pf_identity_divorce': 'Divorce', 'ef_legal_protection': 'Protection of property', 'ef_legal_military': 'Military interference in law',
                   'ef_trade_movement': 'Control over movement of people and goods', 'ef_regulation_credit_ownership': 'Ownership of Banks'})

In [5]:
freedom_df.head()

Unnamed: 0,year,countries,Civil justice,Criminal justice,Homicide,Disappearances,Womens movement,Freedom of movement,Religious freedom,Freedom to establish and operate political parties,Press killed,Press jailed,regulations influence media,Freedom of expression,Same-sex ralitionships,Divorce,Protection of property,Military interference in law,Control over movement of people and goods,Ownership of Banks
0,2016,Albania,4.547244,4.666508,8.920429,10.0,5.0,6.666667,9.192593,10.0,10.0,10.0,5.0,8.607143,10.0,5.0,4.512228,8.333333,6.406138,5.0
1,2016,Algeria,,,9.456254,10.0,5.0,5.0,4.944815,5.0,10.0,10.0,2.666667,7.380952,0.0,0.0,4.689952,4.166667,1.590362,0.0
2,2016,Angola,,,8.06026,5.0,10.0,5.0,8.786667,2.5,10.0,10.0,2.666667,6.452381,0.0,10.0,2.512364,3.333333,2.044823,8.0
3,2016,Argentina,5.79196,4.34393,7.622974,10.0,10.0,10.0,7.795926,5.0,10.0,10.0,5.666667,8.738095,10.0,10.0,4.255995,7.5,4.697482,5.0
4,2016,Armenia,,,8.80875,10.0,10.0,6.666667,6.222222,5.0,10.0,10.0,3.333333,7.154762,10.0,5.0,5.664317,5.833333,6.830998,10.0


In [6]:
#freedom_df.set_index('year', inplace=True)
#freedom_df.head()

In [7]:
dfObj = freedom_df.sort_values(by =['countries', 'year']).reset_index()

freedom_df = pd.DataFrame(dfObj)
freedom_df.head()

Unnamed: 0,year,countries,Civil justice,Criminal justice,Homicide,Disappearances,Womens movement,Freedom of movement,Religious freedom,Freedom to establish and operate political parties,Press killed,Press jailed,regulations influence media,Freedom of expression,Same-sex ralitionships,Divorce,Protection of property,Military interference in law,Control over movement of people and goods,Ownership of Banks
0,2008,Albania,5.074785,4.100516,8.756539,10.0,10.0,10.0,9.195,,10.0,10.0,4.666667,7.666667,10.0,,3.717371,8.333333,2.109925,5.0
1,2009,Albania,5.074785,4.100516,8.852373,10.0,10.0,10.0,9.791667,,10.0,10.0,4.666667,7.666667,10.0,,3.913393,8.333333,3.624133,5.0
2,2010,Albania,5.074785,4.100516,8.272417,10.0,10.0,10.0,9.416667,,10.0,10.0,4.666667,7.604167,10.0,,4.046221,8.333333,3.675338,5.0
3,2011,Albania,5.074785,4.100516,8.05922,10.0,5.0,6.666667,9.135185,8.75,10.0,10.0,4.666667,8.595238,10.0,,3.561092,8.333333,3.423734,5.0
4,2012,Albania,4.9,3.6,7.849344,10.0,5.0,6.666667,9.305556,8.75,10.0,10.0,5.0,8.678571,10.0,,3.075611,8.333333,3.216676,5.0


In [8]:
census_csv = "C:\\Users\\JMadd\\git\\ETL_Project\\Census_reshaped.csv"

census_df = pd.read_csv(census_csv)

In [9]:
census_df.head()

Unnamed: 0.1,Unnamed: 0,Table,Country,Year,country_code,crude_birth_rate,crude_death_rate,growth_rate,infant_mortality,infant_mortality_female,...,life_expectancy_male,midyear_population,mortality_rate_1to4,mortality_rate_1to4_female,mortality_rate_1to4_male,mortality_rate_under5,mortality_rate_under5_female,mortality_rate_under5_male,net_migration,rate_natural_increase
0,0,birth_death_growth_rates,Afghanistan,2008,AF,40.3,15.64,3.269,,,...,,,,,,,,,8.03,2.466
1,1,birth_death_growth_rates,Afghanistan,2009,AF,40.05,15.34,2.282,,,...,,,,,,,,,-1.9,2.471
2,2,birth_death_growth_rates,Afghanistan,2010,AF,39.77,15.09,2.143,,,...,,,,,,,,,-3.25,2.468
3,3,birth_death_growth_rates,Afghanistan,2011,AF,39.53,14.84,2.183,,,...,,,,,,,,,-2.87,2.469
4,4,birth_death_growth_rates,Afghanistan,2012,AF,39.3,14.59,2.22,,,...,,,,,,,,,-2.51,2.471


In [15]:
census_df = census_df.rename(columns={"Country": "countries", "Year": "year"})
census_df.drop(['Unnamed: 0'], axis=1)
census_df.head()

Unnamed: 0,Table,countries,year,country_code,crude_birth_rate,crude_death_rate,growth_rate,infant_mortality,infant_mortality_female,infant_mortality_male,...,life_expectancy_male,midyear_population,mortality_rate_1to4,mortality_rate_1to4_female,mortality_rate_1to4_male,mortality_rate_under5,mortality_rate_under5_female,mortality_rate_under5_male,net_migration,rate_natural_increase
0,birth_death_growth_rates,Afghanistan,2008,AF,40.30,15.64,3.269,,,,...,,,,,,,,,8.03,2.466
1,birth_death_growth_rates,Afghanistan,2009,AF,40.05,15.34,2.282,,,,...,,,,,,,,,-1.90,2.471
2,birth_death_growth_rates,Afghanistan,2010,AF,39.77,15.09,2.143,,,,...,,,,,,,,,-3.25,2.468
3,birth_death_growth_rates,Afghanistan,2011,AF,39.53,14.84,2.183,,,,...,,,,,,,,,-2.87,2.469
4,birth_death_growth_rates,Afghanistan,2012,AF,39.30,14.59,2.220,,,,...,,,,,,,,,-2.51,2.471
5,birth_death_growth_rates,Afghanistan,2013,AF,39.05,14.35,2.254,,,,...,,,,,,,,,-2.16,2.470
6,birth_death_growth_rates,Afghanistan,2014,AF,38.84,14.12,2.289,,,,...,,,,,,,,,-1.83,2.472
7,birth_death_growth_rates,Afghanistan,2015,AF,38.57,13.89,2.317,,,,...,,,,,,,,,-1.51,2.468
8,birth_death_growth_rates,Afghanistan,2016,AF,38.28,13.65,2.343,,,,...,,,,,,,,,-1.20,2.463
9,birth_death_growth_rates,Albania,2008,AL,11.37,5.29,-0.152,,,,...,,,,,,,,,-7.60,0.608


In [16]:
census_df.set_index(['year', 'countries'])
census_df.head()

Unnamed: 0.1,Unnamed: 0,Table,countries,year,country_code,crude_birth_rate,crude_death_rate,growth_rate,infant_mortality,infant_mortality_female,...,life_expectancy_male,midyear_population,mortality_rate_1to4,mortality_rate_1to4_female,mortality_rate_1to4_male,mortality_rate_under5,mortality_rate_under5_female,mortality_rate_under5_male,net_migration,rate_natural_increase
0,0,birth_death_growth_rates,Afghanistan,2008,AF,40.3,15.64,3.269,,,...,,,,,,,,,8.03,2.466
1,1,birth_death_growth_rates,Afghanistan,2009,AF,40.05,15.34,2.282,,,...,,,,,,,,,-1.9,2.471
2,2,birth_death_growth_rates,Afghanistan,2010,AF,39.77,15.09,2.143,,,...,,,,,,,,,-3.25,2.468
3,3,birth_death_growth_rates,Afghanistan,2011,AF,39.53,14.84,2.183,,,...,,,,,,,,,-2.87,2.469
4,4,birth_death_growth_rates,Afghanistan,2012,AF,39.3,14.59,2.22,,,...,,,,,,,,,-2.51,2.471


In [22]:
new_df = pd.merge(freedom_df, census_df,  how='left', on=['year', 'countries'])
census_df.drop(['Unnamed: 0'], axis=1)
new_df.head()

Unnamed: 0,year,countries,Civil justice,Criminal justice,Homicide,Disappearances,Womens movement,Freedom of movement,Religious freedom,Freedom to establish and operate political parties,...,life_expectancy_male,midyear_population,mortality_rate_1to4,mortality_rate_1to4_female,mortality_rate_1to4_male,mortality_rate_under5,mortality_rate_under5_female,mortality_rate_under5_male,net_migration,rate_natural_increase
0,2008,Albania,5.074785,4.100516,8.756539,10.0,10.0,10.0,9.195,,...,,,,,,,,,-7.6,0.608
1,2008,Albania,5.074785,4.100516,8.756539,10.0,10.0,10.0,9.195,,...,,2984121.0,,,,,,,,
2,2008,Albania,5.074785,4.100516,8.756539,10.0,10.0,10.0,9.195,,...,75.5,,2.31,2.26,2.36,15.4,14.17,16.5,,
3,2009,Albania,5.074785,4.100516,8.852373,10.0,10.0,10.0,9.791667,,...,,,,,,,,,-5.48,0.593
4,2009,Albania,5.074785,4.100516,8.852373,10.0,10.0,10.0,9.791667,,...,,2982540.0,,,,,,,,


In [23]:
connection_string = "postgres:pineapple@localhost:5432/ETL_Database"
engine = create_engine(f'postgresql://{connection_string}')
new_df.to_sql('countries', con=engine)
engine.execute("SELECT * FROM countries").fetchall()

[(0, 2008, 'Albania', 5.074784651, 4.100515809, 8.756539449, 10.0, 10.0, 10.0, 9.195, None, 10.0, 10.0, 4.666666667, 7.666666667, 10.0, None, 3.7173706810000002, 8.333333332999999, 2.109924676, 5.0, 9.0, 'birth_death_growth_rates', 'AL', 11.37, 5.29, -0.152, None, None, None, None, None, None, None, None, None, None, None, None, None, -7.6, 0.608),
 (1, 2008, 'Albania', 5.074784651, 4.100515809, 8.756539449, 10.0, 10.0, 10.0, 9.195, None, 10.0, 10.0, 4.666666667, 7.666666667, 10.0, None, 3.7173706810000002, 8.333333332999999, 2.109924676, 5.0, 2038.0, 'midyear_population', 'AL', None, None, None, None, None, None, None, None, None, 2984121.0, None, None, None, None, None, None, None, None),
 (2, 2008, 'Albania', 5.074784651, 4.100515809, 8.756539449, 10.0, 10.0, 10.0, 9.195, None, 10.0, 10.0, 4.666666667, 7.666666667, 10.0, None, 3.7173706810000002, 8.333333332999999, 2.109924676, 5.0, 4081.0, 'mortality_life_expectancy', 'AL', None, None, None, 13.12, 11.94, 14.17, 77.93, 80.68, 75.5,

In [24]:
engine.table_names()

['countries']