In [1]:
import io
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
# Open the PostgreSQL connection used by the raw-SQL cells below.
# Credentials are read from the PG* environment variables (the names libpq
# itself uses) so real secrets never have to be committed with the notebook;
# the fallbacks keep the original local-development values.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "adsdb"),
    user=os.environ.get("PGUSER", "adsdb"),
    password=os.environ.get("PGPASSWORD", "adsdb"),
)
cur = conn.cursor()

In [3]:
# Ask the catalogue for every table that lives in the `trusted` schema.
# `tables` ends up as a list of 1-tuples, one per table name.
trusted_tables_query = """SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'trusted'"""
cur.execute(trusted_tables_query)
tables = cur.fetchall()
tables

[('notes_0',),
 ('demographiccountry_0',),
 ('demographic_0',),
 ('demographiclabels_0',),
 ('morticd_7',),
 ('morticd_8',),
 ('morticd_9',),
 ('morticd_10_1',),
 ('morticd_10_2',),
 ('morticd_10_3',),
 ('morticd_10_4',),
 ('morticd_10_5',),
 ('countrycodes_0',),
 ('population_0',)]

In [4]:
# Read every trusted table whose name starts with 'morticd_10' into its own
# DataFrame. The names come straight from information_schema (see `tables`),
# so they are double-quoted to match the catalogue spelling exactly.
# No `params` are passed: the statement has no placeholders, and handing the
# driver an unused parameter dict would make any literal '%' in the SQL
# subject to parameter formatting.
morticd_10_tables = [name for (name,) in tables if name.startswith('morticd_10')]
df_mortic = [pd.read_sql(f'SELECT * from trusted."{name}";', conn) for name in morticd_10_tables]

In [5]:
# Stack the per-table frames row-wise into a single deaths frame.
# Each piece keeps its own row labels, so the combined index is not unique.
df_deaths = pd.concat(objs=df_mortic, axis=0)

In [6]:
# Load the demographic facts, their indicator labels, and the two country
# lookup tables (one for the mortality data, one for the demographic data).
# The queries contain no placeholders, so no `params` argument is passed.
df_demo = pd.read_sql('SELECT * from trusted.demographic_0;', conn)
df_demo_labels = pd.read_sql('SELECT * from trusted.demographiclabels_0;', conn)
df_country_deaths = pd.read_sql('SELECT * from trusted.countrycodes_0;', conn)
df_country_demo = pd.read_sql('SELECT * from trusted.demographiccountry_0;', conn)

In [7]:
# Narrow the deaths frame to the four columns the following cells work with.
c = ["Country", "Year", "Cause", "Deaths1"]
df_deaths = df_deaths.loc[:, c]

In [8]:
# Sum `Deaths1` per (Year, Country) with one column per Cause, then keep only
# the 'AAA' cause and drop the Cause level so a single 'Deaths1' column remains.
# NOTE(review): 'AAA' is presumably the all-causes total — confirm against the
# data dictionary.
# (Year, Country) pairs without an 'AAA' record stay in the result as NaN.
# The aggregation is named by the string "sum": passing the Python builtin
# `sum` is deprecated in recent pandas and emits a FutureWarning.
df_mortality_long = (
    df_deaths
    .pivot_table(index=["Year", "Country"], columns="Cause", aggfunc="sum")
    .loc[:, (["Deaths1"], ["AAA"])]
    .xs(key="AAA", level="Cause", axis=1)
)

In [9]:
# Enrich the mortality counts with the country lookup: the 'Country' index
# level of df_mortality_long is matched against the lookup's 'Country' column.
country_lookup_deaths = df_country_deaths.set_index("Country")
df_mortality = df_mortality_long.join(country_lookup_deaths, on="Country")

# Enrich the demographic facts in two steps, each time re-indexing by the key
# and joining the matching lookup table on it:
#   1. CountryId   -> df_country_demo (keyed by 'Code')
#   2. IndicatorId -> df_demo_labels  (keyed by 'Indicator')
# Re-indexing by IndicatorId in step 2 discards the CountryId index from step 1.
# Caution: df_demo is overwritten, so this cell cannot be re-run without first
# re-running the cell that loads df_demo.
demo_by_country = df_demo.set_index("CountryId")
df_demo = demo_by_country.join(df_country_demo.set_index("Code"), on="CountryId")

demo_by_indicator = df_demo.set_index("IndicatorId")
df_demo = demo_by_indicator.join(df_demo_labels.set_index("Indicator"), on="IndicatorId")

In [10]:
# Reshape the demographic facts to one row per (Year, Name) and one column per
# indicator Description, summing `Value` where several rows share a cell.
# pivot_table groups on the Year/Name columns directly, so no prior set_index
# is needed; the aggregation is named by string because passing the builtin
# `sum` is deprecated in recent pandas.
df_demo_pivoted = df_demo.pivot_table(
    index=["Year", "Name"],
    columns="Description",
    values="Value",
    aggfunc="sum",
)

# Inner-join mortality with the pivoted indicators on (Year, Name): only
# country-years present on both sides survive. reset_index turns the
# mortality frame's index levels back into ordinary columns.
df_final = df_mortality.join(df_demo_pivoted, on=["Year", "Name"], how="inner").reset_index()

In [11]:
# Give the death-count column its final name, then show summary statistics.
column_renames = {'Deaths1': 'Deaths'}
df_final = df_final.rename(columns=column_renames)
df_final.describe()

Unnamed: 0,Year,Country,Deaths,"Fertility rate, total (births per woman)",GDP (constant LCU),GDP (current LCU),GDP (current US$),GDP deflator (base year varies by country),GDP growth (annual %),GDP per capita (current LCU),...,Population aged 15-24 years (thousands),Population aged 25-64 years (thousands),Population aged 65 years or older (thousands),Population growth (annual %),Poverty headcount ratio at $3.20 a day (PPP) (% of population),"Prevalence of HIV, total (% of population ages 15-49)",Price level ratio of PPP conversion factor (GDP) to market exchange rate,Rural population (% of total population),Total debt service (% of GNI),Total population (thousands)
count,2014.0,2014.0,1874.0,1821.0,1873.0,1942.0,1883.0,1873.0,1871.0,1942.0,...,1945.0,1945.0,1945.0,1911.0,938.0,964.0,1841.0,1912.0,738.0,1945.0
mean,2007.7572,3163.839126,155128.6,2.024317,49913240000000.0,55653850000000.0,469483800000.0,103.714148,3.18797,1284477.0,...,3259.693328,10611.824621,2306.855596,0.992558,7.841045,0.737967,0.664597,32.656063,6.214997,20918.296281
std,6.083468,1006.651091,343310.6,0.630276,336038100000000.0,546422800000000.0,1621040000000.0,110.65962,3.793484,7621454.0,...,6552.406557,22008.631253,5363.514135,1.512831,11.608127,2.246074,0.314318,19.434909,5.257849,42460.048235
min,1994.0,1125.0,12.0,0.918,128000000.0,97020000.0,63101270.0,3.36751,-14.83861,264.9452,...,0.471,2.618,0.31,-3.84767,0.0,0.1,0.13555,0.0,0.09469,4.425
25%,2003.0,2230.0,3623.0,1.5251,23363830000.0,17450520000.0,10154680000.0,73.12882,1.26376,16169.97,...,97.255,317.076,49.817,0.20873,0.4,0.1,0.43102,17.3955,3.0987,617.86
50%,2008.0,3190.0,36861.0,1.86,347676800000.0,284090900000.0,52023500000.0,92.24561,3.19185,35971.2,...,799.827,2526.499,537.101,0.82659,1.7,0.3,0.5948,31.717,4.93059,5292.118
75%,2013.0,4182.0,123072.5,2.453,2437285000000.0,1826261000000.0,271461900000.0,103.79032,5.114805,118421.0,...,3125.066,9438.923,1827.677,1.496135,12.1,0.5,0.86957,46.01275,7.706363,18991.431
max,2019.0,5198.0,2813503.0,4.669,6916081000000000.0,1.315126e+16,19542980000000.0,2054.76642,26.17025,165291600.0,...,44640.324,169460.902,50126.008,17.51221,72.8,19.3,1.85566,81.804,58.42308,325084.756


In [12]:
# Display every column label of the merged frame.
df_final.columns

Index(['Year', 'Country', 'Deaths', 'Name',
       'Fertility rate, total (births per woman)', 'GDP (constant LCU)',
       'GDP (current LCU)', 'GDP (current US$)',
       'GDP deflator (base year varies by country)', 'GDP growth (annual %)',
       'GDP per capita (current LCU)', 'GDP per capita (current US$)',
       'GDP per capita, PPP (constant 2011 international $)',
       'GDP per capita, PPP (current international $)',
       'GDP, PPP (constant 2011 international $)',
       'GDP, PPP (current international $)', 'GNI (current LCU)',
       'GNI per capita (current LCU)',
       'GNI per capita, Atlas method (current US$)',
       'GNI per capita, PPP (current international $)',
       'General government total expenditure (current LCU)',
       'Life expectancy at birth, total (years)',
       'Mortality rate, infant (per 1,000 live births)',
       'Official exchange rate (LCU per US$, period average)',
       'PPP conversion factor, GDP (LCU per international $)',
       '

In [13]:
# Make sure the target schema exists; IF NOT EXISTS keeps this cell re-runnable.
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS exploitation;"
cur.execute(create_schema_sql)

In [14]:
# Commit the open transaction on `conn` so that work executed through its
# cursor is persisted and visible to other connections.
conn.commit()

In [15]:
# Build the SQLAlchemy engine used by DataFrame.to_sql.
# User, password and database name come from the PG* environment variables so
# secrets are not committed with the notebook; the fallbacks reproduce the
# original local-development URL. User and password are URL-escaped because
# characters such as '@' or ':' would otherwise corrupt the connection URL.
pg_user = quote_plus(os.environ.get("PGUSER", "adsdb"))
pg_password = quote_plus(os.environ.get("PGPASSWORD", "adsdb"))
pg_database = os.environ.get("PGDATABASE", "adsdb")
engine = create_engine(f"postgresql+psycopg2://{pg_user}:{pg_password}@localhost:5432/{pg_database}")

# Write the merged frame to exploitation.demographic_deaths.
# if_exists='replace' drops and recreates the table on every run;
# index=False leaves the DataFrame's row index out of the table.
df_final.to_sql('demographic_deaths', engine, if_exists='replace', index=False, schema='exploitation')