In [1]:
import pandas as pd
import numpy as np

### read data

In [2]:
model_data = pd.read_csv('../data/curated/model-data/model-data.csv', index_col=0)

# Read xlsx file into DataFrame df
xls = pd.ExcelFile('../data/external-raw-data/Quarterly median rents by local government area - March Quarter 2021.xlsx')
# set header
rent_data = pd.read_excel(xls, 'All Properties', header=[2,3])

# Read xlsx file into DataFrame df
xls_convert = pd.ExcelFile('../data/external-raw-data/lga_postcode_table.xlsx')
df_convert = pd.read_excel(xls_convert, 'lga_postcode_mappings')

### convert lga to postcode with past_rent_data

In [3]:
# Preprocessing for rent data file
# Change the Local Government Area column of rent data to UPPER case for merge
rent_data.loc[:, ('Unnamed: 1_level_0', 'Unnamed: 1_level_1')] = rent_data['Unnamed: 1_level_0']['Unnamed: 1_level_1'].str.upper()
# drop first column which is the region column that we do not need
rent_data = rent_data.drop([('Unnamed: 0_level_0', 'Unnamed: 0_level_1')], axis=1)

In [4]:
# convert multi index to simple index
rent_data.columns = rent_data.columns.map('_'.join)
rent_data.rename(columns = {'Unnamed: 1_level_0_Unnamed: 1_level_1':'LGA region'}, inplace = True)

In [5]:
# Only remain the data of Victoria
df_convert = df_convert[df_convert['State'] =='Victoria']
df_convert = df_convert[df_convert['Postcode']< 4000]
df_convert = df_convert[df_convert['Postcode']>= 3000]

# Change the LGA region column of convertion file to UPPER case for merge
df_convert['LGA region'] = df_convert['LGA region'].str.upper().str.strip()

In [6]:
# reset the index and drop the original index column
df_convert = df_convert.reset_index()
df_convert = df_convert.drop(['index'], axis=1)

In [7]:
# make sure LGA region for two dataframe are the same data type
rent_data.loc[:,"LGA region"] = rent_data.loc[:,"LGA region"].astype(str).str.strip()
df_convert.loc[:,"LGA region"] = df_convert.loc[:,"LGA region"].astype(str).str.strip()

# inner join merge by Local Government Area and LGA region
df_merge = df_convert.merge(rent_data, left_on="LGA region", right_on="LGA region", how='inner')

In [8]:
# group by by postcode by median rent of each postcode
past_rent_data = df_merge.groupby(['Postcode']).median().reset_index()

### check with model data and join it

In [9]:
postcode_list = list(model_data['postcode'].unique())

In [10]:
df_filtered = past_rent_data.loc[past_rent_data['Postcode'].isin(postcode_list)]

In [11]:
df_filtered.replace("none", np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.replace("none", np.nan, inplace=True)


In [12]:
df_filtered.rename(columns = {'Postcode':'postcode'}, inplace = True)

# selected new columns
column_names_select_external = ['postcode', 'Mar 2018_Median', 'Jun 2018_Median', 'Sep 2018_Median', 'Dec 2018_Median',
                                'Mar 2019_Median', 'Jun 2019_Median', 'Sep 2019_Median', 'Dec 2019_Median',
                                'Mar 2020_Median', 'Jun 2020_Median', 'Sep 2020_Median', 'Dec 2020_Median',
                                'Mar 2021_Median']

df_filtered = df_filtered[column_names_select_external]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns = {'Postcode':'postcode'}, inplace = True)


In [13]:
# join internal features with external features
model_data = pd.merge(model_data, df_filtered, on='postcode', how='outer')

In [14]:
counts =model_data.isna().sum()

In [15]:
counts

postcode                        0
street                          0
suburb                          0
latitude                        0
longitude                       0
bedrooms                        0
bathrooms                       0
parking                         0
property_type                   0
price                           0
postcode_property_count         0
public-service-duration         0
care-facility-duration          0
shopping-center-duration        0
train-station-duration          0
hospital-duration               0
CBD-duration                    0
emergency-service-duration      0
property-count                  0
elector-count                   0
crime-rate                      0
income                          0
pri-1-10                        0
pri-11-50                       0
pri-51-100                      0
pri-101-150                     0
pri-150+                        0
sec-1-10                        0
sec-11-50                       0
sec-51-100    

### group by postcode

In [22]:
model_data

Unnamed: 0,postcode,street,suburb,latitude,longitude,bedrooms,bathrooms,parking,property_type,price,...,Dec 2018_Median,Mar 2019_Median,Jun 2019_Median,Sep 2019_Median,Dec 2019_Median,Mar 2020_Median,Jun 2020_Median,Sep 2020_Median,Dec 2020_Median,Mar 2021_Median
0,3029,33 cindia crescent,TARNEIT,-37.836800,144.681992,3,2,2,House,410.0,...,370.0,370.0,370.0,370.0,380.0,375.0,370.0,370.0,370.0,370.0
1,3029,3 loire close,HOPPERS CROSSING,-37.858776,144.670990,4,2,2,House,420.0,...,370.0,370.0,370.0,370.0,380.0,375.0,370.0,370.0,370.0,370.0
2,3029,5 toscana,TRUGANINA,-37.831726,144.721200,4,2,2,New House & Land,440.0,...,370.0,370.0,370.0,370.0,380.0,375.0,370.0,370.0,370.0,370.0
3,3029,12 sussex court,TARNEIT,-37.850758,144.712234,3,3,1,House,415.0,...,370.0,370.0,370.0,370.0,380.0,375.0,370.0,370.0,370.0,370.0
4,3029,4a craig close,TRUGANINA,-37.843980,144.717514,3,2,1,House,410.0,...,370.0,370.0,370.0,370.0,380.0,375.0,370.0,370.0,370.0,370.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12122,3934,15/26 green island avenue,MOUNT MARTHA,-38.248100,145.041611,3,2,2,Townhouse,625.0,...,,,,,,,,,,
12123,3934,8 raymond street,MOUNT MARTHA,-38.285440,144.995331,4,2,2,House,720.0,...,,,,,,,,,,
12124,3934,15 whitsunday ct,MOUNT MARTHA,-38.250070,145.052078,4,2,2,House,700.0,...,,,,,,,,,,
12125,3934,107 forest dr,MOUNT MARTHA,-38.290226,145.016418,3,2,2,House,870.0,...,,,,,,,,,,


In [27]:
model_data.to_csv("../data/curated/model-data/model-data-with-rent.csv")