In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw income table, then discard the three columns that the
# calculations below do not use.
income_raw = pd.read_csv('../data/tables/income.csv')
income = income_raw.drop(columns=['Negative income','Not stated','Not applicable'])

In [3]:
# change column names
# The first label names the column that later becomes `postcode`, the numeric
# labels are parsed with int() by the later cells and used as the dollar value
# of each column, and the last label is the 'Total' column.
# `inplace` is deliberately not passed: the keyword was removed from
# DataFrame.set_axis in pandas 2.0 (passing it raises TypeError) and the call
# already returns a new frame by default.
income_labels = [
    'INCP Total Personal Income (weekly)',
    '0', '75', '225', '350', '450', '575', '725', '900',
    '1125', '1375', '1625', '1875', '2500', '3500',
    'Total',
]
income = income.set_axis(income_labels, axis=1)

In [4]:
# to get mean of income for each postcode
# Every column between the first (postcode label) and the last ('Total') holds
# a count; its column name, parsed as an int, is the value that count is
# weighted by.  The weighted sum of each row is divided by the row's 'Total'.
# NOTE(review): 'Total' can be larger than the sum of the retained columns
# (there are rows where every retained column is 0 but 'Total' is not), so the
# denominator includes people outside these columns -- confirm this is intended.
bin_counts = income.iloc[:, 1:-1]
bin_values = [int(label) for label in bin_counts.columns]
weighted_sums = bin_counts.mul(bin_values, axis=1).sum(axis=1)
row_means = weighted_sums / income.iloc[:, -1]

# Kept as two-decimal strings because the following cells compare against
# '0.00' and 'nan' (a 0/0 row formats as 'nan').
mean_output = ["{:.2f}".format(value) for value in row_means]


In [5]:
# Start the per-postcode statistics table: the postcode label taken from the
# income table next to the formatted mean computed above.
final = pd.DataFrame({
    'postcode': income['INCP Total Personal Income (weekly)'],
    'average_salary': mean_output,
})

In [6]:
# rows whose formatted average is exactly '0.00'
final.loc[final['average_salary'].eq('0.00')]

Unnamed: 0,postcode,average_salary
109,"2129, NSW",0.0
1318,"4009, QLD",0.0
2164,"6090, WA",0.0
2171,"6106, WA",0.0
2216,"6182, WA",0.0
2363,"6452, WA",0.0
2465,"6731, WA",0.0


In [7]:
# rows whose formatted average is the string "nan"
final.loc[final['average_salary'].eq("nan")]

Unnamed: 0,postcode,average_salary
104,"2123, NSW",
674,"3062, VIC",
1431,"4222, QLD",
1532,"4475, QLD",
1750,"5005, SA",
2089,"5950, SA",
2453,"6711, WA",
2466,"6733, WA",
2478,"7001, TAS",
2513,"7139, TAS",


In [8]:
# Source rows behind the '0.00' averages: look up the income table with the
# mask built on `final` to inspect the counts that produced them.
zero_average_mask = final['average_salary'].eq('0.00')
income.loc[zero_average_mask]

Unnamed: 0,INCP Total Personal Income (weekly),0,75,225,350,450,575,725,900,1125,1375,1625,1875,2500,3500,Total
109,"2129, NSW",0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
1318,"4009, QLD",0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
2164,"6090, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,8
2171,"6106, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,25
2216,"6182, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2363,"6452, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2465,"6731, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,3


In [9]:
# Source rows behind the 'nan' averages: look up the income table with the
# mask built on `final` to inspect the counts that produced them.
nan_average_mask = final['average_salary'].eq('nan')
income.loc[nan_average_mask]

Unnamed: 0,INCP Total Personal Income (weekly),0,75,225,350,450,575,725,900,1125,1375,1625,1875,2500,3500,Total
104,"2123, NSW",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
674,"3062, VIC",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1431,"4222, QLD",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1532,"4475, QLD",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1750,"5005, SA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2089,"5950, SA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2453,"6711, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2466,"6733, WA",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2478,"7001, TAS",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2513,"7139, TAS",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Turn the textual 'nan' averages into '0.00'.  The pattern is a plain
# literal, so it is matched as text rather than as a regular expression.
final['average_salary'] = final['average_salary'].str.replace('nan', '0.00', regex=False)

In [11]:
# to get 5 quantiles of the salary in each postcode
# Q0..Q4 correspond, in order, to these quantile levels.
QUANTILE_LEVELS = (0.1, 0.25, 0.5, 0.75, 0.9)


def _format_row_quantiles(counts, values, levels=QUANTILE_LEVELS):
    """Return the quantiles of one row of grouped counts as "{:.2f}" strings.

    counts: the count stored in each income column of the row.
    values: the integer value representing each of those columns.
    levels: quantile levels handed to np.quantile.

    Each value is repeated `count` times to rebuild the row's sample.  When
    that sample is empty or contains only zeros, every quantile is reported
    as "0.00" and np.quantile is not called.
    """
    sample = np.repeat(values, counts)
    if not sample.any():
        return ["0.00"] * len(levels)
    return ["{:.2f}".format(q) for q in np.quantile(sample, levels)]


# Columns between the first (postcode label) and the last ('Total') hold the
# counts; their names parsed as int are the values being repeated.
bin_values = np.array([int(label) for label in income.columns[1:-1]])
bin_counts = income.iloc[:, 1:-1].to_numpy()
quantile_rows = [_format_row_quantiles(row, bin_values) for row in bin_counts]

Q0_list = [row[0] for row in quantile_rows]
Q1_list = [row[1] for row in quantile_rows]
Q2_list = [row[2] for row in quantile_rows]
Q3_list = [row[3] for row in quantile_rows]
Q4_list = [row[4] for row in quantile_rows]

    

In [12]:
# Attach the quantile lists to the final table; the pairs are listed in the
# order the columns should appear (median first, then Q0, Q1, Q3, Q4).
quantile_columns = [
    ('median_salary', Q2_list),
    ('Q0_salary', Q0_list),
    ('Q1_salary', Q1_list),
    ('Q3_salary', Q3_list),
    ('Q4_salary', Q4_list),
]
for column_name, column_values in quantile_columns:
    final[column_name] = column_values

In [16]:
# keep postcode numbers only, without state name
# The pattern is a raw string so that \d reaches the regex engine as the digit
# class instead of being an invalid escape in a normal string literal.
# expand=False returns a Series for the single capture group rather than a
# one-column DataFrame.
final['postcode'] = final['postcode'].str.extract(r'(\d+)', expand=False)
# Change the quantiles and mean to float, instead of string
salary_columns = [
    'average_salary',
    'median_salary',
    'Q0_salary',
    'Q1_salary',
    'Q3_salary',
    'Q4_salary',
]
final = final.astype({column: float for column in salary_columns})

In [18]:
# display the final table: postcode plus the mean and quantile columns as floats
final

Unnamed: 0,postcode,average_salary,median_salary,Q0_salary,Q1_salary,Q3_salary,Q4_salary
0,2000,794.83,575.0,0.0,350.0,1375.0,2500.0
1,2006,205.55,75.0,0.0,0.0,225.0,450.0
2,2007,539.31,350.0,0.0,0.0,900.0,1625.0
3,2008,681.00,575.0,0.0,0.0,1125.0,1875.0
4,2009,1110.81,1125.0,75.0,450.0,2500.0,3500.0
...,...,...,...,...,...,...,...
2648,2913,841.63,1125.0,75.0,450.0,1625.0,2500.0
2649,2914,834.74,1125.0,0.0,450.0,1875.0,2500.0
2650,2899,542.66,575.0,225.0,350.0,900.0,1375.0
2651,6798,702.21,1125.0,0.0,575.0,1875.0,2500.0


In [20]:
# Write the final table to the curated data folder, leaving out the row index.
cleaned_income_path = '../data/curated/income_cleaned.csv'
final.to_csv(cleaned_income_path, index=False)

In [19]:
# rows whose Q1_salary (the 0.25 quantile) is zero
final.loc[final['Q1_salary'].eq(0.00)]

Unnamed: 0,postcode,average_salary,median_salary,Q0_salary,Q1_salary,Q3_salary,Q4_salary
1,2006,205.55,75.0,0.0,0.0,225.0,450.0
2,2007,539.31,350.0,0.0,0.0,900.0,1625.0
3,2008,681.0,575.0,0.0,0.0,1125.0,1875.0
24,2032,616.34,450.0,0.0,0.0,1125.0,1875.0
43,2052,231.75,75.0,0.0,0.0,350.0,725.0
90,2109,543.02,350.0,0.0,0.0,900.0,1625.0
104,2123,0.0,0.0,0.0,0.0,0.0,0.0
109,2129,0.0,0.0,0.0,0.0,0.0,0.0
240,2308,209.5,225.0,0.0,0.0,225.0,450.0
383,2522,209.58,225.0,0.0,0.0,350.0,575.0
