In [2]:
# Dependencies and Setup
import glob
import os
import time

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from sqlalchemy import create_engine, inspect
from webdriver_manager.chrome import ChromeDriverManager

# Turn off pandas' chained-assignment warning for the whole notebook.
# The trailing note records the library default; it must stay a comment,
# otherwise it defines a stray global variable named `default`.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Source page for the income tables (ABS income-and-work census release)
overview_url = (
    "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions"
    "/income-and-work-census/latest-release"
)

In [4]:
# Have pandas download the page and parse its HTML tables into DataFrames
tables = pd.read_html(io=overview_url)

In [5]:
# The first parsed table is the one used as the income summary
summary_df = pd.DataFrame(data=tables[0])
summary_df

Unnamed: 0.1,Unnamed: 0,Median income,Negative income,Nil income,$1-$149,$150-$299,$300-$399,$400-$499,$500-$649,$650-$799,$800-$999,"$1,000-$1,249","$1,250-$1,499","$1,500-$1,749","$1,750-$1,999","$2,000-$2,999","$3,000 or more"
0,New South Wales,813,48024,558019,199715,301947,497863,497263,472795,453482,518187,580866,436915,379950,289392,519595,373538
1,Victoria,803,40097,462585,181402,259576,393575,398650,378559,366570,437356,483105,365266,321293,229793,392855,263112
2,Queensland,787,28996,303632,140223,201611,325481,326763,322430,306103,360782,384211,280697,242883,177863,291027,170135
3,South Australia,734,9293,103721,50776,81493,131617,135058,121526,111354,128716,141776,101079,82849,58086,87386,47071
4,Western Australia,848,14775,175789,74433,106129,153779,149720,146826,135161,155856,182352,143853,132068,100717,190276,127181
5,Tasmania,701,3018,29744,14983,25490,44367,46543,40553,37358,41024,44639,30046,24860,17270,23812,11468
6,Northern Territory,936,1319,11169,6751,15308,11024,9485,8923,8737,12176,16339,13793,13095,9734,16841,7360
7,Australian Capital Territory,1203,1238,24751,11408,13363,15655,17863,19285,19895,26161,33173,30992,31827,25579,53018,29121
8,Australia(b),805,146805,1669570,679812,1005072,1573594,1581612,1511161,1438936,1680606,1866801,1402863,1229021,908593,1575067,1029073


In [6]:
# Name of the CSV file that receives the income table
output_data_file = 'state_income_summary.csv'

In [7]:
# Write the income table to disk, leaving out the row index
summary_df.to_csv(path_or_buf=output_data_file, index=False)

In [8]:
# Reload the income table from the CSV file. The path comes from
# `output_data_file` rather than a repeated literal, so the file that is read
# is always the file that was saved.
summary_csv = pd.read_csv(output_data_file)

In [9]:
# Give the unlabeled 'Unnamed: 0' column the name 'State'
summary_csv = summary_csv.rename(columns={'Unnamed: 0': 'State'})
summary_csv

Unnamed: 0,State,Median income,Negative income,Nil income,$1-$149,$150-$299,$300-$399,$400-$499,$500-$649,$650-$799,$800-$999,"$1,000-$1,249","$1,250-$1,499","$1,500-$1,749","$1,750-$1,999","$2,000-$2,999","$3,000 or more"
0,New South Wales,813,48024,558019,199715,301947,497863,497263,472795,453482,518187,580866,436915,379950,289392,519595,373538
1,Victoria,803,40097,462585,181402,259576,393575,398650,378559,366570,437356,483105,365266,321293,229793,392855,263112
2,Queensland,787,28996,303632,140223,201611,325481,326763,322430,306103,360782,384211,280697,242883,177863,291027,170135
3,South Australia,734,9293,103721,50776,81493,131617,135058,121526,111354,128716,141776,101079,82849,58086,87386,47071
4,Western Australia,848,14775,175789,74433,106129,153779,149720,146826,135161,155856,182352,143853,132068,100717,190276,127181
5,Tasmania,701,3018,29744,14983,25490,44367,46543,40553,37358,41024,44639,30046,24860,17270,23812,11468
6,Northern Territory,936,1319,11169,6751,15308,11024,9485,8923,8737,12176,16339,13793,13095,9734,16841,7360
7,Australian Capital Territory,1203,1238,24751,11408,13363,15655,17863,19285,19895,26161,33173,30992,31827,25579,53018,29121
8,Australia(b),805,146805,1669570,679812,1005072,1573594,1581612,1511161,1438936,1680606,1866801,1402863,1229021,908593,1575067,1029073


In [10]:
# Keep only the two columns used downstream. Selecting them directly replaces
# a hard-coded list of fifteen income-bracket labels to drop, which raised a
# KeyError whenever a label changed and let any new column slip through.
income_summary_csv = summary_csv.loc[:, ["State", "Median income"]].copy()
income_summary_csv

Unnamed: 0,State,Median income
0,New South Wales,813
1,Victoria,803
2,Queensland,787
3,South Australia,734
4,Western Australia,848
5,Tasmania,701
6,Northern Territory,936
7,Australian Capital Territory,1203
8,Australia(b),805


In [11]:
# Normalise the column headings: 'Unnamed: 0' -> 'State' (when present) and
# 'Median income' -> 'Median Income'. Labels not found are left untouched.
income_summary_csv = income_summary_csv.rename(
    columns={'Unnamed: 0': 'State', 'Median income': 'Median Income'}
)
income_summary_csv

Unnamed: 0,State,Median Income
0,New South Wales,813
1,Victoria,803
2,Queensland,787
3,South Australia,734
4,Western Australia,848
5,Tasmania,701
6,Northern Territory,936
7,Australian Capital Territory,1203
8,Australia(b),805


In [24]:
# Replace the space in the median-income heading with an underscore
income_summary_csv = income_summary_csv.rename(
    columns={"Median Income": "Median_Income"}
)

Population Data

In [12]:
# Source page for the population table (ABS 2021 population census)
population_url = (
    'https://www.abs.gov.au/statistics/people/population'
    '/population-census/2021'
)

In [13]:
# Have pandas download the population page and parse its HTML tables into DataFrames
population_tables = pd.read_html(io=population_url)

In [14]:
# The first parsed table is the one used as the population summary
population_summary_df = pd.DataFrame(data=population_tables[0])
population_summary_df

Unnamed: 0.1,Unnamed: 0,Male,Female,Total
0,New South Wales,3984166,4087995,8072163
1,Victoria,3200963,3302528,6503491
2,Queensland,2540404,2615736,5156138
3,South Australia,878592,902924,1781516
4,Western Australia,1322855,1337171,2660026
5,Tasmania,273765,283804,557571
6,Northern Territory,117526,115075,232605
7,Australian Capital Territory,224361,230140,454499
8,Australia(b),12545154,12877635,25422788


In [15]:
# Name of the CSV file that receives the population table
population_output_data_file = 'state_population_summary.csv'

In [16]:
# Write the population_summary_df dataframe to disk, leaving out the row index
population_summary_df.to_csv(path_or_buf=population_output_data_file, index=False)

In [17]:
# Reload the population table from the CSV file. The path comes from
# `population_output_data_file` rather than a repeated literal, so the file
# that is read is always the file that was saved.
population_summary_csv = pd.read_csv(population_output_data_file)
population_summary_csv

Unnamed: 0.1,Unnamed: 0,Male,Female,Total
0,New South Wales,3984166,4087995,8072163
1,Victoria,3200963,3302528,6503491
2,Queensland,2540404,2615736,5156138
3,South Australia,878592,902924,1781516
4,Western Australia,1322855,1337171,2660026
5,Tasmania,273765,283804,557571
6,Northern Territory,117526,115075,232605
7,Australian Capital Territory,224361,230140,454499
8,Australia(b),12545154,12877635,25422788


In [18]:
# Give the unlabeled 'Unnamed: 0' column the name 'State'
population_summary_csv = population_summary_csv.rename(
    columns={'Unnamed: 0': 'State'}
)
population_summary_csv

Unnamed: 0,State,Male,Female,Total
0,New South Wales,3984166,4087995,8072163
1,Victoria,3200963,3302528,6503491
2,Queensland,2540404,2615736,5156138
3,South Australia,878592,902924,1781516
4,Western Australia,1322855,1337171,2660026
5,Tasmania,273765,283804,557571
6,Northern Territory,117526,115075,232605
7,Australian Capital Territory,224361,230140,454499
8,Australia(b),12545154,12877635,25422788


In [None]:
# Reporting entity 	
# Facility name 	
# Type 	
# State 	
# Electricity production (GJ) 	
# Electricity production (MWh) 	
# Total scope 1 emissions (t CO2-e) 	
# Total scope 2 emissions (t CO2-e) 	
# Total emissions (t CO2-e) 	
# Emission intensity (t CO2-e/ MWh) 	
# Grid connected 	
# Grid 	
# Primary fuel
# Year

In [19]:
# Database location and credentials ("user:password@host:port/database").
# Set the ENERGY_DB_CONNECTION environment variable to supply real credentials
# without committing them to the notebook; when it is unset, the local
# development default below is used.
connection_string = os.environ.get(
    "ENERGY_DB_CONNECTION",
    "postgres:postgres@localhost:5432/energy_db",
)
engine = create_engine(f'postgresql://{connection_string}')

In [20]:
# Confirm tables
# Engine.table_names() is deprecated and was removed in SQLAlchemy 2.0;
# the inspector API lists the table names instead.
inspect(engine).get_table_names()

  engine.table_names()


['Aus_Income',
 'Aus_Population',
 '14_15_Energy_Production',
 '17_18_Energy_Production',
 '20_21_Energy_Production']

In [21]:
# Load the population frame into the Aus_Population table. With
# if_exists='append' the rows are added to whatever the table already holds.
population_summary_csv.to_sql(
    name='Aus_Population',
    con=engine,
    index=False,
    if_exists='append',
)

9

In [25]:
# Load the income frame into the Aus_Income table. With
# if_exists='append' the rows are added to whatever the table already holds.
income_summary_csv.to_sql(
    name='Aus_Income',
    con=engine,
    index=False,
    if_exists='append',
)

9