## Download suburb convert to postcode file

In [1]:
# download train station dataset
import gdown

url_suburb_to_postcode = 'https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv'
output_convert = '../data/raw/suburb_to_postcode.csv'
gdown.download(url_suburb_to_postcode, output_convert, quiet=False)

Downloading...
From: https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv
To: /Users/guozixuan/Desktop/generic-real-estate-consulting-project-group-7/data/raw/suburb_to_postcode.csv
100%|██████████████████████████████████████| 7.56M/7.56M [00:01<00:00, 5.70MB/s]


'../data/raw/suburb_to_postcode.csv'

## Read csv file

In [2]:
# Load pandas
import pandas as pd

# Read CSV file into DataFrame df
df_convert = pd.read_csv("../data/raw/suburb_to_postcode.csv", index_col=0)
df_income = pd.read_csv("../data/raw/income.csv", index_col=0)

In [3]:
# Do the preprocessing with convertion file
# Take rows for victoria
df_convert = df_convert[df_convert["state"] =="VIC"]

# Only remain the column which are postcode and locality
df_convert = df_convert[["postcode", "locality"]]

# Reset the index
df_convert = df_convert.reset_index()

In [4]:
df_convert = df_convert[df_convert['postcode']>=3000]
df_convert = df_convert[df_convert['postcode']<4000]

In [5]:
df_convert

Unnamed: 0,id,postcode,locality
0,4746,3000,MELBOURNE
1,4747,3001,MELBOURNE
2,4748,3002,EAST MELBOURNE
3,4749,3003,WEST MELBOURNE
4,4750,3004,MELBOURNE
...,...,...,...
3504,8520,3995,WATTLE BANK
3505,8521,3995,WONTHAGGI
3506,8522,3995,WOOLAMAI
3507,8523,3996,INVERLOCH


In [6]:
# Change the suburb column of income to UPPER case for merge since the locality of convertion file are upper case
df_income['Suburb'] = df_income['Suburb'].str.upper()

In [7]:
df_income

Unnamed: 0,Rank,Suburb,Value
0,1,CANTERBURY,$2352
1,2,PARK ORCHARDS,$2329
2,3,WONGA PARK,$2221
3,4,BRIGHTON,$2200
4,5,CAMBERWELL,$2122
...,...,...,...
660,261,ST ARNAUD,$713
661,262,MARYBOROUGH,$689
662,263,ROSEBUD WEST,$641
663,264,CARLTON,$633


In [8]:
# make sure locality and suburb are the same data type
df_convert.loc[:,"locality"] = df_convert["locality"].astype(str).str.strip()
df_income.loc[:,"Suburb"] = df_income["Suburb"].astype(str).str.strip()

# inner merge by suburb and locality
df_merge = df_income.merge(df_convert, left_on='Suburb', right_on='locality', how='inner')
df_merge

Unnamed: 0,Rank,Suburb,Value,id,postcode,locality
0,1,CANTERBURY,$2352,6722,3126,CANTERBURY
1,2,PARK ORCHARDS,$2329,4714,3114,PARK ORCHARDS
2,3,WONGA PARK,$2221,4715,3115,WONGA PARK
3,4,BRIGHTON,$2200,6601,3186,BRIGHTON
4,5,CAMBERWELL,$2122,4731,3124,CAMBERWELL
...,...,...,...,...,...,...
691,261,ST ARNAUD,$713,7732,3478,ST ARNAUD
692,262,MARYBOROUGH,$689,7953,3465,MARYBOROUGH
693,263,ROSEBUD WEST,$641,8627,3940,ROSEBUD WEST
694,264,CARLTON,$633,4867,3053,CARLTON


In [9]:
# Only remain the column which are postcode and it's corresponding income
df_merge_select = df_merge[["Value","postcode"]]

In [10]:
# Remove the $ symbol
df_merge_select['Value'] = df_merge_select['Value'].str[1:]
df_merge_select

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge_select['Value'] = df_merge_select['Value'].str[1:]


Unnamed: 0,Value,postcode
0,2352,3126
1,2329,3114
2,2221,3115
3,2200,3186
4,2122,3124
...,...,...
691,713,3478
692,689,3465
693,641,3940
694,633,3053


In [11]:
# Convert the income from string to float
df_merge_select["Value"] = pd.to_numeric(df_merge_select.loc[:,"Value"]).values.tolist()

# Group by by the postcode(average each postcode)
df_group_by = df_merge_select.groupby(['postcode']).median()
df_group_by = df_group_by.reset_index()
df_group_by

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge_select["Value"] = pd.to_numeric(df_merge_select.loc[:,"Value"]).values.tolist()


Unnamed: 0,postcode,Value
0,3000,994.0
1,3001,994.0
2,3002,1989.0
3,3004,994.0
4,3006,1838.0
...,...,...
321,3976,1165.0
322,3977,1297.0
323,3981,1115.0
324,3995,814.0


In [12]:
# Check if there any postcode that have property but do not have income data
domain_data = pd.read_csv("../data/raw/domain_cleaned.csv")
property_postcode = domain_data['postcode'].unique()
income_postcode = df_group_by['postcode'].unique()
inputation_postcode = [i for i in property_postcode if i not in income_postcode]

In [13]:
# Create a new dataframe for postcode that not included in the income data
df_inputation = pd.DataFrame(inputation_postcode, columns=['postcode'])
df_inputation['Value'] = df_inputation.apply(lambda _: '', axis=1)
# Do the median inputation for those postcode
df_inputation = df_inputation.assign(Value=df_group_by['Value'].median())
df_inputation

Unnamed: 0,postcode,Value
0,3003,1219.5
1,3008,1219.5
2,3027,1219.5
3,3167,1219.5
4,3336,1219.5
5,3750,1219.5
6,3754,1219.5
7,3809,1219.5
8,3975,1219.5
9,3978,1219.5


In [14]:
# Concat two dataframe to get the complete income dataframe
income_data = [df_group_by, df_inputation]
df_all_income = pd.concat(income_data)
df_all_income.reset_index()
df_all_income

Unnamed: 0,postcode,Value
0,3000,994.0
1,3001,994.0
2,3002,1989.0
3,3004,994.0
4,3006,1838.0
...,...,...
5,3750,1219.5
6,3754,1219.5
7,3809,1219.5
8,3975,1219.5


In [15]:
# Save to csv
df_all_income.to_csv("../data/raw/income_by_postcode.csv")